| |
| <!DOCTYPE html> |
| |
| <html> |
| <head> |
| <meta charset="utf-8" /> |
| <title>pyspark.sql.functions — PySpark master documentation</title> |
| |
| <link href="../../../_static/styles/theme.css?digest=1999514e3f237ded88cf" rel="stylesheet"> |
| <link href="../../../_static/styles/pydata-sphinx-theme.css?digest=1999514e3f237ded88cf" rel="stylesheet"> |
| |
| |
| <link rel="stylesheet" |
| href="../../../_static/vendor/fontawesome/5.13.0/css/all.min.css"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2"> |
| |
| |
| |
| |
| |
| <link rel="stylesheet" href="../../../_static/styles/pydata-sphinx-theme.css" type="text/css" /> |
| <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" /> |
| |
| <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf"> |
| |
| <script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script> |
| <script src="../../../_static/jquery.js"></script> |
| <script src="../../../_static/underscore.js"></script> |
| <script src="../../../_static/doctools.js"></script> |
| <script src="../../../_static/language_data.js"></script> |
| <script src="../../../_static/clipboard.min.js"></script> |
| <script src="../../../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script> |
| <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "tex2jax_ignore|mathjax_ignore|document", "processClass": "tex2jax_process|mathjax_process|math|output_area"}})</script> |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/sql/functions.html" /> |
| <link rel="search" title="Search" href="../../../search.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="None"> |
| |
| |
| <!-- Google Analytics --> |
| |
| </head> |
| <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80"> |
| |
| <div class="container-fluid" id="banner"></div> |
| |
| |
| <nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main"><div class="container-xl"> |
| |
| <div id="navbar-start"> |
| |
| |
| |
| <a class="navbar-brand" href="../../../index.html"> |
| <img src="../../../_static/spark-logo-reverse.png" class="logo" alt="logo"> |
| </a> |
| |
| |
| |
| </div> |
| |
| <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-collapsible" aria-controls="navbar-collapsible" aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| |
| |
| <div id="navbar-collapsible" class="col-lg-9 collapse navbar-collapse"> |
| <div id="navbar-center" class="mr-auto"> |
| |
| <div class="navbar-center-item"> |
| <ul id="navbar-main-elements" class="navbar-nav"> |
| <li class="toctree-l1 nav-item"> |
| <a class="reference internal nav-link" href="../../../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| <li class="toctree-l1 nav-item"> |
| <a class="reference internal nav-link" href="../../../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| <li class="toctree-l1 nav-item"> |
| <a class="reference internal nav-link" href="../../../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| <li class="toctree-l1 nav-item"> |
| <a class="reference internal nav-link" href="../../../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| <li class="toctree-l1 nav-item"> |
| <a class="reference internal nav-link" href="../../../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| <li class="toctree-l1 nav-item"> |
| <a class="reference internal nav-link" href="../../../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| |
| </ul> |
| </div> |
| |
| </div> |
| |
| <div id="navbar-end"> |
| |
| <div class="navbar-end-item"> |
| <!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| master |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Function to construct the target URL from the JSON components |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| template = template.replace("{version}", entry.version); |
| return template; |
| } |
| |
| // Function to check if corresponding page path exists in other version of docs |
| // and, if so, go there instead of the homepage of the other docs version |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "_modules/pyspark/sql/functions.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| let tryUrl = `${otherDocsHomepage}${currentFilePath}`; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Function to populate the version switcher |
| (function () { |
| // get JSON config |
| $.getJSON("_static/versions.json", function(data, textStatus, jqXHR) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| node.onclick = checkPageExistsAndRedirect; |
| $("#version_switcher").append(node); |
| }); |
| }); |
| })(); |
| </script> |
| </div> |
| |
| </div> |
| </div> |
| </div> |
| </nav> |
| |
| |
| <div class="container-xl"> |
| <div class="row"> |
| |
| |
| <!-- Only show if we have sidebars configured, else just a small margin --> |
| <div class="col-12 col-md-3 bd-sidebar"> |
| <div class="sidebar-start-items"><form class="bd-search d-flex align-items-center" action="../../../search.html" method="get"> |
| <i class="icon fas fa-search"></i> |
| <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" > |
| </form><nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation"> |
| <div class="bd-toc-item active"> |
| |
| </div> |
| </nav> |
| </div> |
| <div class="sidebar-end-items"> |
| </div> |
| </div> |
| |
| |
| |
| |
| <div class="d-none d-xl-block col-xl-2 bd-toc"> |
| |
| </div> |
| |
| |
| |
| |
| |
| |
| <main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main"> |
| |
| <div> |
| |
| <h1>Source code for pyspark.sql.functions</h1><div class="highlight"><pre> |
| <span></span><span class="c1">#</span> |
| <span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span> |
| <span class="c1"># contributor license agreements. See the NOTICE file distributed with</span> |
| <span class="c1"># this work for additional information regarding copyright ownership.</span> |
| <span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span> |
| <span class="c1"># (the "License"); you may not use this file except in compliance with</span> |
| <span class="c1"># the License. You may obtain a copy of the License at</span> |
| <span class="c1">#</span> |
| <span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># Unless required by applicable law or agreed to in writing, software</span> |
| <span class="c1"># distributed under the License is distributed on an "AS IS" BASIS,</span> |
| <span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span> |
| <span class="c1"># See the License for the specific language governing permissions and</span> |
| <span class="c1"># limitations under the License.</span> |
| <span class="c1">#</span> |
| |
| <span class="sd">"""</span> |
<span class="sd">A collection of builtin functions</span>
| <span class="sd">"""</span> |
| <span class="kn">import</span> <span class="nn">inspect</span> |
| <span class="kn">import</span> <span class="nn">decimal</span> |
| <span class="kn">import</span> <span class="nn">sys</span> |
| <span class="kn">import</span> <span class="nn">functools</span> |
| <span class="kn">import</span> <span class="nn">warnings</span> |
| <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">Any</span><span class="p">,</span> |
| <span class="n">cast</span><span class="p">,</span> |
| <span class="n">Callable</span><span class="p">,</span> |
| <span class="n">Dict</span><span class="p">,</span> |
| <span class="n">List</span><span class="p">,</span> |
| <span class="n">Iterable</span><span class="p">,</span> |
| <span class="n">overload</span><span class="p">,</span> |
| <span class="n">Optional</span><span class="p">,</span> |
| <span class="n">Tuple</span><span class="p">,</span> |
| <span class="n">Type</span><span class="p">,</span> |
| <span class="n">TYPE_CHECKING</span><span class="p">,</span> |
| <span class="n">Union</span><span class="p">,</span> |
| <span class="n">ValuesView</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="kn">from</span> <span class="nn">py4j.java_gateway</span> <span class="kn">import</span> <span class="n">JVMView</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">SparkContext</span> |
| <span class="kn">from</span> <span class="nn">pyspark.errors</span> <span class="kn">import</span> <span class="n">PySparkTypeError</span><span class="p">,</span> <span class="n">PySparkValueError</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.column</span> <span class="kn">import</span> <span class="n">Column</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">,</span> <span class="n">_create_column_from_literal</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.dataframe</span> <span class="kn">import</span> <span class="n">DataFrame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">ArrayType</span><span class="p">,</span> <span class="n">DataType</span><span class="p">,</span> <span class="n">StringType</span><span class="p">,</span> <span class="n">StructType</span><span class="p">,</span> <span class="n">_from_numpy_type</span> |
| |
| <span class="c1"># Keep UserDefinedFunction import for backwards compatible import; moved in SPARK-22409</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.udf</span> <span class="kn">import</span> <span class="n">UserDefinedFunction</span><span class="p">,</span> <span class="n">_create_py_udf</span> <span class="c1"># noqa: F401</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.udtf</span> <span class="kn">import</span> <span class="n">UserDefinedTableFunction</span><span class="p">,</span> <span class="n">_create_py_udtf</span> |
| |
| <span class="c1"># Keep pandas_udf and PandasUDFType import for backwards compatible import; moved in SPARK-28264</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.pandas.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span><span class="p">,</span> <span class="n">PandasUDFType</span> <span class="c1"># noqa: F401</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.utils</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">to_str</span><span class="p">,</span> |
| <span class="n">has_numpy</span><span class="p">,</span> |
| <span class="n">try_remote_functions</span><span class="p">,</span> |
| <span class="n">get_active_spark_context</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql._typing</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">ColumnOrName</span><span class="p">,</span> |
| <span class="n">ColumnOrName_</span><span class="p">,</span> |
| <span class="n">DataTypeOrString</span><span class="p">,</span> |
| <span class="n">UserDefinedFunctionLike</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">has_numpy</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span> |
| |
<span class="c1"># Note to developers: all of the PySpark functions here accept strings as column names whenever possible.</span>
<span class="c1"># Namely, if columns are referred to as arguments, they can always be both Column or string,</span>
<span class="c1"># even though there might be a few exceptions for legacy or inevitable reasons.</span>
<span class="c1"># If you are fixing other language APIs together, please also note that the Scala side is not the case</span>
<span class="c1"># since it requires explicitly defining every single overload.</span>
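
<span class="c1"># For illustration only (hypothetical DataFrame `df` with a column named "value"):</span>
<span class="c1"># because most functions accept either a column name or a Column object, the two</span>
<span class="c1"># calls below build the same expression:</span>
<span class="c1">#</span>
<span class="c1">#   df.select(sqrt("value"))</span>
<span class="c1">#   df.select(sqrt(col("value")))</span>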
| |
| |
| <span class="k">def</span> <span class="nf">_get_jvm_function</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">sc</span><span class="p">:</span> <span class="n">SparkContext</span><span class="p">)</span> <span class="o">-></span> <span class="n">Callable</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Retrieves JVM function identified by name from</span> |
| <span class="sd"> Java gateway associated with sc.</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="k">return</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">functions</span><span class="p">,</span> <span class="n">name</span><span class="p">)</span> |
| |
| |
| <span class="k">def</span> <span class="nf">_invoke_function</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Invokes JVM function identified by name with args</span> |
| <span class="sd"> and wraps the result with :class:`~pyspark.sql.Column`.</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="n">jf</span> <span class="o">=</span> <span class="n">_get_jvm_function</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">jf</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">))</span> |
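
<span class="c1"># Illustrative sketch (not part of the module logic): _invoke_function("lit", 5)</span>
<span class="c1"># is roughly Column(sc._jvm.functions.lit(5)) for the active SparkContext `sc`.</span>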
| |
| |
| <span class="k">def</span> <span class="nf">_invoke_function_over_columns</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Invokes n-ary JVM function identified by name</span> |
| <span class="sd"> and wraps the result with :class:`~pyspark.sql.Column`.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="o">*</span><span class="p">(</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">))</span> |
| |
| |
| <span class="k">def</span> <span class="nf">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">cols</span><span class="p">:</span> <span class="s2">"Iterable[ColumnOrName]"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
<span class="sd"> Invokes a unary JVM function identified by name with a sequence of columns</span>
<span class="sd"> and wraps the result with :class:`~pyspark.sql.Column`.</span>
| <span class="sd"> """</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span> |
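
<span class="c1"># Illustrative only (hypothetical calls): the two helpers above pass columns to the</span>
<span class="c1"># JVM differently, e.g.</span>
<span class="c1">#</span>
<span class="c1">#   _invoke_function_over_columns("sqrt", "a")                  # each arg becomes its own Column</span>
<span class="c1">#   _invoke_function_over_seq_of_columns("array", ["a", "b"])   # the iterable becomes one Java Seq</span>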
| |
| |
| <span class="k">def</span> <span class="nf">_invoke_binary_math_function</span><span class="p">(</span><span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col1</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Invokes binary JVM math function identified by name</span> |
| <span class="sd"> and wraps the result with :class:`~pyspark.sql.Column`.</span> |
| <span class="sd"> """</span> |
| |
<span class="c1"># For legacy reasons, the arguments here can be implicitly converted into a column</span>
| <span class="n">cols</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">_to_java_column</span><span class="p">(</span><span class="n">c</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">))</span> <span class="k">else</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">c</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="p">(</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">)</span> |
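
<span class="c1"># Illustrative only: because of the implicit conversion above, a hypothetical call like</span>
<span class="c1">#</span>
<span class="c1">#   _invoke_binary_math_function("pow", "x", 2.0)</span>
<span class="c1">#</span>
<span class="c1"># resolves "x" as a column reference and wraps the plain Python number 2.0 as a literal column.</span>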
| |
| |
| <span class="k">def</span> <span class="nf">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]]:</span> |
| <span class="k">if</span> <span class="n">options</span><span class="p">:</span> |
| <span class="k">return</span> <span class="p">{</span><span class="n">key</span><span class="p">:</span> <span class="n">to_str</span><span class="p">(</span><span class="n">value</span><span class="p">)</span> <span class="k">for</span> <span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span> <span class="ow">in</span> <span class="n">options</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span> |
| <span class="k">return</span> <span class="p">{}</span> |
| |
| |
| <div class="viewcode-block" id="lit"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.lit.html#pyspark.sql.functions.lit">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">lit</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Creates a :class:`~pyspark.sql.Column` of literal value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column`, str, int, float, bool or list, NumPy literals or ndarray.</span> |
<span class="sd"> the value to create a PySpark literal from. If a Column is passed,</span>
<span class="sd"> it is returned as is.</span>
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Since 3.4.0, it supports the list type.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the literal instance.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(lit(5).alias('height'), df.id).show()</span> |
| <span class="sd"> +------+---+</span> |
| <span class="sd"> |height| id|</span> |
| <span class="sd"> +------+---+</span> |
| <span class="sd"> | 5| 0|</span> |
| <span class="sd"> +------+---+</span> |
| |
| <span class="sd"> Create a literal from a list.</span> |
| |
| <span class="sd"> >>> spark.range(1).select(lit([1, 2, 3])).show()</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> |array(1, 2, 3)|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> | [1, 2, 3]|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">col</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">col</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"COLUMN_IN_LIST"</span><span class="p">,</span> <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"func_name"</span><span class="p">:</span> <span class="s2">"lit"</span><span class="p">}</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">array</span><span class="p">(</span><span class="o">*</span><span class="p">[</span><span class="n">lit</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="k">for</span> <span class="n">item</span> <span class="ow">in</span> <span class="n">col</span><span class="p">])</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">has_numpy</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">generic</span><span class="p">):</span> |
| <span class="n">dt</span> <span class="o">=</span> <span class="n">_from_numpy_type</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">dt</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"lit"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="n">dt</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="nb">str</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"lit"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
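
<span class="c1"># Illustrative only (assumes NumPy is installed): a NumPy scalar takes the np.generic</span>
<span class="c1"># branch above, so lit(np.int64(7)) behaves roughly like lit(7).astype("bigint").alias("7").</span>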
| |
| |
| <div class="viewcode-block" id="col"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.col.html#pyspark.sql.functions.col">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">col</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a :class:`~pyspark.sql.Column` based on the given column name.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : str</span> |
| <span class="sd"> the name for the column</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the corresponding column instance.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> col('x')</span> |
| <span class="sd"> Column<'x'></span> |
| <span class="sd"> >>> column('x')</span> |
| <span class="sd"> Column<'x'></span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"col"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <span class="n">column</span> <span class="o">=</span> <span class="n">col</span> |
| |
| |
| <div class="viewcode-block" id="asc"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.asc.html#pyspark.sql.functions.asc">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">asc</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a sort expression based on the ascending order of the given column name.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to sort by in the ascending order.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column specifying the order.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Sort by the column 'id' in the descending order.</span> |
| |
| <span class="sd"> >>> df = spark.range(5)</span> |
| <span class="sd"> >>> df = df.sort(desc("id"))</span> |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | id|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> +---+</span> |
| |
| <span class="sd"> Sort by the column 'id' in the ascending order.</span> |
| |
| <span class="sd"> >>> df.orderBy(asc("id")).show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | id|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">col</span><span class="o">.</span><span class="n">asc</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"asc"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="desc"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.desc.html#pyspark.sql.functions.desc">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">desc</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a sort expression based on the descending order of the given column name.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to sort by in the descending order.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column specifying the order.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Sort by the column 'id' in the descending order.</span> |
| |
| <span class="sd"> >>> spark.range(5).orderBy(desc("id")).show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | id|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">col</span><span class="o">.</span><span class="n">desc</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"desc"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="sqrt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sqrt.html#pyspark.sql.functions.sqrt">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">sqrt</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the square root of the specified float value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(sqrt(lit(4))).show()</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> |SQRT(4)|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> | 2.0|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"sqrt"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="try_add"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_add.html#pyspark.sql.functions.try_add">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">try_add</span><span class="p">(</span><span class="n">left</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">right</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
<span class="sd"> Returns the sum of `left` and `right`; the result is null on overflow.</span>
<span class="sd"> The acceptable input types are the same as with the `+` operator.</span>
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
<span class="sd"> left : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the first value or column to add.</span>
<span class="sd"> right : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the second value or column to add.</span>
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1982, 15), (1990, 2)], ["birth", "age"])</span> |
| <span class="sd"> >>> df.select(try_add(df.birth, df.age).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=1997), Row(r=1992)]</span> |
| |
| <span class="sd"> >>> from pyspark.sql.types import StructType, StructField, IntegerType, StringType</span> |
| <span class="sd"> >>> schema = StructType([</span> |
| <span class="sd"> ... StructField("i", IntegerType(), True),</span> |
| <span class="sd"> ... StructField("d", StringType(), True),</span> |
| <span class="sd"> ... ])</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, '2015-09-30')], schema)</span> |
| <span class="sd"> >>> df = df.select(df.i, to_date(df.d).alias('d'))</span> |
| <span class="sd"> >>> df.select(try_add(df.d, df.i).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=datetime.date(2015, 10, 1))]</span> |
| |
| <span class="sd"> >>> df.select(try_add(df.d, make_interval(df.i)).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=datetime.date(2016, 9, 30))]</span> |
| |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... try_add(df.d, make_interval(lit(0), lit(0), lit(0), df.i)).alias('r')</span> |
| <span class="sd"> ... ).collect()</span> |
| <span class="sd"> [Row(r=datetime.date(2015, 10, 1))]</span> |
| |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... try_add(make_interval(df.i), make_interval(df.i)).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> |2 years|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"try_add"</span><span class="p">,</span> <span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="try_avg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_avg.html#pyspark.sql.functions.try_avg">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">try_avg</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
<span class="sd"> Returns the mean calculated from the values of a group; the result is null on overflow.</span>
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [(1982, 15), (1990, 2)], ["birth", "age"]</span> |
| <span class="sd"> ... ).select(sf.try_avg("age")).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |try_avg(age)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 8.5|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"try_avg"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="try_divide"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_divide.html#pyspark.sql.functions.try_divide">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">try_divide</span><span class="p">(</span><span class="n">left</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">right</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns `dividend`/`divisor`. It always performs floating point division. Its result is</span> |
| <span class="sd"> always null if `divisor` is 0.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> left : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> dividend</span> |
| <span class="sd"> right : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> divisor</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(6000, 15), (1990, 2)], ["a", "b"])</span> |
| <span class="sd"> >>> df.select(try_divide(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=400.0), Row(r=995.0)]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(1, 2)], ["year", "month"])</span> |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... try_divide(make_interval(df.year), df.month).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> |6 months|</span> |
| <span class="sd"> +--------+</span> |
| |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... try_divide(make_interval(df.year, df.month), lit(2)).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> |7 months|</span> |
| <span class="sd"> +--------+</span> |
| |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... try_divide(make_interval(df.year, df.month), lit(0)).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> |NULL|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"try_divide"</span><span class="p">,</span> <span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="try_multiply"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_multiply.html#pyspark.sql.functions.try_multiply">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">try_multiply</span><span class="p">(</span><span class="n">left</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">right</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
<span class="sd"> Returns `left`*`right`; the result is null on overflow. The acceptable input types are the</span>
<span class="sd"> same as with the `*` operator.</span>
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> left : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> multiplicand</span> |
| <span class="sd"> right : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> multiplier</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(6000, 15), (1990, 2)], ["a", "b"])</span> |
| <span class="sd"> >>> df.select(try_multiply(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=90000), Row(r=3980)]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(2, 3),], ["a", "b"])</span> |
| <span class="sd"> >>> df.select(try_multiply(make_interval(df.a), df.b).alias('r')).show(truncate=False)</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> |6 years|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"try_multiply"</span><span class="p">,</span> <span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="try_subtract"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_subtract.html#pyspark.sql.functions.try_subtract">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">try_subtract</span><span class="p">(</span><span class="n">left</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">right</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
<span class="sd"> Returns `left`-`right`; the result is null on overflow. The acceptable input types are the</span>
<span class="sd"> same as with the `-` operator.</span>
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
<span class="sd"> left : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the value or column to subtract from.</span>
<span class="sd"> right : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> the value or column to subtract.</span>
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(6000, 15), (1990, 2)], ["a", "b"])</span> |
| <span class="sd"> >>> df.select(try_subtract(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=5985), Row(r=1988)]</span> |
| |
| <span class="sd"> >>> from pyspark.sql.types import StructType, StructField, IntegerType, StringType</span> |
| <span class="sd"> >>> schema = StructType([</span> |
| <span class="sd"> ... StructField("i", IntegerType(), True),</span> |
| <span class="sd"> ... StructField("d", StringType(), True),</span> |
| <span class="sd"> ... ])</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, '2015-09-30')], schema)</span> |
| <span class="sd"> >>> df = df.select(df.i, to_date(df.d).alias('d'))</span> |
| <span class="sd"> >>> df.select(try_subtract(df.d, df.i).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=datetime.date(2015, 9, 29))]</span> |
| |
| <span class="sd"> >>> df.select(try_subtract(df.d, make_interval(df.i)).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=datetime.date(2014, 9, 30))]</span> |
| |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... try_subtract(df.d, make_interval(lit(0), lit(0), lit(0), df.i)).alias('r')</span> |
| <span class="sd"> ... ).collect()</span> |
| <span class="sd"> [Row(r=datetime.date(2015, 9, 29))]</span> |
| |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... try_subtract(make_interval(df.i), make_interval(df.i)).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |0 seconds|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"try_subtract"</span><span class="p">,</span> <span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="try_sum"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_sum.html#pyspark.sql.functions.try_sum">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">try_sum</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
<span class="sd"> Returns the sum calculated from the values of a group; the result is null on overflow.</span>
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
<span class="sd"> col : :class:`~pyspark.sql.Column` or str</span>
<span class="sd"> target column to compute on.</span>
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(10).select(sf.try_sum("id")).show()</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> |try_sum(id)|</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> | 45|</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"try_sum"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="abs"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.abs.html#pyspark.sql.functions.abs">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">abs</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the absolute value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(abs(lit(-1))).show()</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> |abs(-1)|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"abs"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="mode"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.mode.html#pyspark.sql.functions.mode">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">mode</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the most frequent value in a group.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the most frequent value in a group.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... ("Java", 2012, 20000), ("dotNET", 2012, 5000),</span> |
| <span class="sd"> ... ("Java", 2012, 20000), ("dotNET", 2012, 5000),</span> |
| <span class="sd"> ... ("dotNET", 2013, 48000), ("Java", 2013, 30000)],</span> |
| <span class="sd"> ... schema=("course", "year", "earnings"))</span> |
| <span class="sd"> >>> df.groupby("course").agg(mode("year")).show()</span> |
| <span class="sd"> +------+----------+</span> |
| <span class="sd"> |course|mode(year)|</span> |
| <span class="sd"> +------+----------+</span> |
| <span class="sd"> | Java| 2012|</span> |
| <span class="sd"> |dotNET| 2012|</span> |
| <span class="sd"> +------+----------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"mode"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="max"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.max.html#pyspark.sql.functions.max">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">max</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the maximum value of the expression in a group.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(10)</span> |
| <span class="sd"> >>> df.select(max(col("id"))).show()</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> |max(id)|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> | 9|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"max"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="min"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.min.html#pyspark.sql.functions.min">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">min</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the minimum value of the expression in a group.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(10)</span> |
| <span class="sd"> >>> df.select(min(df.id)).show()</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> |min(id)|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"min"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="max_by"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.max_by.html#pyspark.sql.functions.max_by">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">max_by</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">ord</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the value associated with the maximum value of ord.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| <span class="sd"> ord : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to be maximized</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> value associated with the maximum value of ord.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... ("Java", 2012, 20000), ("dotNET", 2012, 5000),</span> |
| <span class="sd"> ... ("dotNET", 2013, 48000), ("Java", 2013, 30000)],</span> |
| <span class="sd"> ... schema=("course", "year", "earnings"))</span> |
| <span class="sd"> >>> df.groupby("course").agg(max_by("year", "earnings")).show()</span> |
| <span class="sd"> +------+----------------------+</span> |
| <span class="sd"> |course|max_by(year, earnings)|</span> |
| <span class="sd"> +------+----------------------+</span> |
| <span class="sd"> | Java| 2013|</span> |
| <span class="sd"> |dotNET| 2013|</span> |
| <span class="sd"> +------+----------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"max_by"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">ord</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="min_by"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.min_by.html#pyspark.sql.functions.min_by">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">min_by</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">ord</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the value associated with the minimum value of ord.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| <span class="sd"> ord : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to be minimized</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> value associated with the minimum value of ord.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... ("Java", 2012, 20000), ("dotNET", 2012, 5000),</span> |
| <span class="sd"> ... ("dotNET", 2013, 48000), ("Java", 2013, 30000)],</span> |
| <span class="sd"> ... schema=("course", "year", "earnings"))</span> |
| <span class="sd"> >>> df.groupby("course").agg(min_by("year", "earnings")).show()</span> |
| <span class="sd"> +------+----------------------+</span> |
| <span class="sd"> |course|min_by(year, earnings)|</span> |
| <span class="sd"> +------+----------------------+</span> |
| <span class="sd"> | Java| 2012|</span> |
| <span class="sd"> |dotNET| 2012|</span> |
| <span class="sd"> +------+----------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"min_by"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">ord</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="count"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.count.html#pyspark.sql.functions.count">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">count</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the number of items in a group.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Count by all columns (start), and by a column that does not count ``None``.</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(None,), ("a",), ("b",), ("c",)], schema=["alphabets"])</span> |
| <span class="sd"> >>> df.select(count(expr("*")), count(df.alphabets)).show()</span> |
| <span class="sd"> +--------+----------------+</span> |
| <span class="sd"> |count(1)|count(alphabets)|</span> |
| <span class="sd"> +--------+----------------+</span> |
| <span class="sd"> | 4| 3|</span> |
| <span class="sd"> +--------+----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"count"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="sum"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sum.html#pyspark.sql.functions.sum">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">sum</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the sum of all values in the expression.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(10)</span> |
| <span class="sd"> >>> df.select(sum(df["id"])).show()</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> |sum(id)|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> | 45|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"sum"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="avg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.avg.html#pyspark.sql.functions.avg">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">avg</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the average of the values in a group.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(10)</span> |
| <span class="sd"> >>> df.select(avg(col("id"))).show()</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> |avg(id)|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> | 4.5|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"avg"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="mean"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.mean.html#pyspark.sql.functions.mean">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the average of the values in a group.</span> |
| <span class="sd"> An alias of :func:`avg`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(10)</span> |
| <span class="sd"> >>> df.select(mean(df.id)).show()</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> |avg(id)|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> | 4.5|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"mean"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="median"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.median.html#pyspark.sql.functions.median">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">median</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the median of the values in a group.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the median of the values in a group.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... ("Java", 2012, 20000), ("dotNET", 2012, 5000),</span> |
| <span class="sd"> ... ("Java", 2012, 22000), ("dotNET", 2012, 10000),</span> |
| <span class="sd"> ... ("dotNET", 2013, 48000), ("Java", 2013, 30000)],</span> |
| <span class="sd"> ... schema=("course", "year", "earnings"))</span> |
| <span class="sd"> >>> df.groupby("course").agg(median("earnings")).show()</span> |
| <span class="sd"> +------+----------------+</span> |
| <span class="sd"> |course|median(earnings)|</span> |
| <span class="sd"> +------+----------------+</span> |
| <span class="sd"> | Java| 22000.0|</span> |
| <span class="sd"> |dotNET| 10000.0|</span> |
| <span class="sd"> +------+----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"median"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="sumDistinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sumDistinct.html#pyspark.sql.functions.sumDistinct">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">sumDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the sum of distinct values in the expression.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> .. deprecated:: 3.2.0</span> |
| <span class="sd"> Use :func:`sum_distinct` instead.</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">"Deprecated in 3.2, use sum_distinct instead."</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">sum_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="sum_distinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sum_distinct.html#pyspark.sql.functions.sum_distinct">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">sum_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the sum of distinct values in the expression.</span> |
| |
| <span class="sd"> .. versionadded:: 3.2.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(None,), (1,), (1,), (2,)], schema=["numbers"])</span> |
| <span class="sd"> >>> df.select(sum_distinct(col("numbers"))).show()</span> |
| <span class="sd"> +---------------------+</span> |
| <span class="sd"> |sum(DISTINCT numbers)|</span> |
| <span class="sd"> +---------------------+</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> +---------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"sum_distinct"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="product"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.product.html#pyspark.sql.functions.product">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">product</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the product of the values in a group.</span> |
| |
| <span class="sd"> .. versionadded:: 3.2.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : str, :class:`Column`</span> |
| <span class="sd"> column containing values to be multiplied together</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1, 10).toDF('x').withColumn('mod3', col('x') % 3)</span> |
| <span class="sd"> >>> prods = df.groupBy('mod3').agg(product('x').alias('product'))</span> |
| <span class="sd"> >>> prods.orderBy('mod3').show()</span> |
| <span class="sd"> +----+-------+</span> |
| <span class="sd"> |mod3|product|</span> |
| <span class="sd"> +----+-------+</span> |
| <span class="sd"> | 0| 162.0|</span> |
| <span class="sd"> | 1| 28.0|</span> |
| <span class="sd"> | 2| 80.0|</span> |
| <span class="sd"> +----+-------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"product"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="acos"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.acos.html#pyspark.sql.functions.acos">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">acos</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes inverse cosine of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> inverse cosine of `col`, as if computed by `java.lang.Math.acos()`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1, 3)</span> |
| <span class="sd"> >>> df.select(acos(df.id)).show()</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> |ACOS(id)|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> | 0.0|</span> |
| <span class="sd"> | NaN|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"acos"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="acosh"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.acosh.html#pyspark.sql.functions.acosh">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">acosh</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes inverse hyperbolic cosine of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(2)</span> |
| <span class="sd"> >>> df.select(acosh(col("id"))).show()</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |ACOSH(id)|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> | NaN|</span> |
| <span class="sd"> | 0.0|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"acosh"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="asin"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.asin.html#pyspark.sql.functions.asin">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">asin</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes inverse sine of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> inverse sine of `col`, as if computed by `java.lang.Math.asin()`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(0,), (2,)])</span> |
| <span class="sd"> >>> df.select(asin(df.schema.fieldNames()[0])).show()</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> |ASIN(_1)|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> | 0.0|</span> |
| <span class="sd"> | NaN|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"asin"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="asinh"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.asinh.html#pyspark.sql.functions.asinh">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">asinh</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes inverse hyperbolic sine of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(asinh(col("id"))).show()</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |ASINH(id)|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> | 0.0|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"asinh"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="atan"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.atan.html#pyspark.sql.functions.atan">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">atan</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute inverse tangent of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> inverse tangent of `col`, as if computed by `java.lang.Math.atan()`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(atan(df.id)).show()</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> |ATAN(id)|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> | 0.0|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"atan"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="atanh"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.atanh.html#pyspark.sql.functions.atanh">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">atanh</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes inverse hyperbolic tangent of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(0,), (2,)], schema=["numbers"])</span> |
| <span class="sd"> >>> df.select(atanh(df["numbers"])).show()</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> |ATANH(numbers)|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> | 0.0|</span> |
| <span class="sd"> | NaN|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"atanh"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="cbrt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.cbrt.html#pyspark.sql.functions.cbrt">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">cbrt</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the cube-root of the given value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(cbrt(lit(27))).show()</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> |CBRT(27)|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> | 3.0|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"cbrt"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="ceil"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ceil.html#pyspark.sql.functions.ceil">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">ceil</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the ceiling of the given value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(ceil(lit(-0.1))).show()</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> |CEIL(-0.1)|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"ceil"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="ceiling"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ceiling.html#pyspark.sql.functions.ceiling">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">ceiling</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the ceiling of the given value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(1).select(sf.ceil(sf.lit(-0.1))).show()</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> |CEIL(-0.1)|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"ceiling"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="cos"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.cos.html#pyspark.sql.functions.cos">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">cos</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes cosine of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> angle in radians</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> cosine of the angle, as if computed by `java.lang.Math.cos()`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import math</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(cos(lit(math.pi))).first()</span> |
| <span class="sd"> Row(COS(3.14159...)=-1.0)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"cos"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="cosh"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.cosh.html#pyspark.sql.functions.cosh">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">cosh</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes hyperbolic cosine of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> hyperbolic angle</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> hyperbolic cosine of the angle, as if computed by `java.lang.Math.cosh()`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(cosh(lit(1))).first()</span> |
| <span class="sd"> Row(COSH(1)=1.54308...)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"cosh"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="cot"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.cot.html#pyspark.sql.functions.cot">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">cot</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes cotangent of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> angle in radians.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> cotangent of the angle.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import math</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(cot(lit(math.radians(45)))).first()</span> |
| <span class="sd"> Row(COT(0.78539...)=1.00000...)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"cot"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="csc"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.csc.html#pyspark.sql.functions.csc">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">csc</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes cosecant of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> angle in radians.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> cosecant of the angle.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import math</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(csc(lit(math.radians(90)))).first()</span> |
| <span class="sd"> Row(CSC(1.57079...)=1.0)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"csc"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="e"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.e.html#pyspark.sql.functions.e">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">e</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns Euler's number.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.range(1).select(e()).show()</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> | E()|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> |2.718281828459045|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"e"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="exp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.exp.html#pyspark.sql.functions.exp">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">exp</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the exponential of the given value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to calculate exponential for.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> exponential of the given value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(exp(lit(0))).show()</span> |
| <span class="sd"> +------+</span> |
| <span class="sd"> |EXP(0)|</span> |
| <span class="sd"> +------+</span> |
| <span class="sd"> | 1.0|</span> |
| <span class="sd"> +------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"exp"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="expm1"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.expm1.html#pyspark.sql.functions.expm1">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">expm1</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the exponential of the given value minus one.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to calculate exponential for.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> exponential less one.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(expm1(lit(1))).first()</span> |
| <span class="sd"> Row(EXPM1(1)=1.71828...)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"expm1"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="floor"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.floor.html#pyspark.sql.functions.floor">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">floor</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the floor of the given value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to find floor for.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> nearest integer that is less than or equal to given value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(floor(lit(2.5))).show()</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> |FLOOR(2.5)|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"floor"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">log</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the natural logarithm of the given value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to calculate natural logarithm for.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> natural logarithm of the given value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import math</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(log(lit(math.e))).first()</span> |
| <span class="sd"> Row(ln(2.71828...)=1.0)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"log"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="log10"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.log10.html#pyspark.sql.functions.log10">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">log10</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the logarithm of the given value in Base 10.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to calculate logarithm for.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> logarithm of the given value in Base 10.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(log10(lit(100))).show()</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> |LOG10(100)|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> | 2.0|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"log10"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="log1p"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.log1p.html#pyspark.sql.functions.log1p">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">log1p</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the natural logarithm of the "given value plus one".</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to calculate natural logarithm for.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> natural logarithm of the "given value plus one".</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import math</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(log1p(lit(math.e))).first()</span> |
| <span class="sd"> Row(LOG1P(2.71828...)=1.31326...)</span> |
| |
| <span class="sd"> Same as:</span> |
| |
| <span class="sd"> >>> df.select(log(lit(math.e+1))).first()</span> |
| <span class="sd"> Row(ln(3.71828...)=1.31326...)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"log1p"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="negative"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.negative.html#pyspark.sql.functions.negative">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">negative</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the negative value.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to calculate negative value for.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> negative value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(3).select(sf.negative("id")).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |negative(id)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | -1|</span> |
| <span class="sd"> | -2|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"negative"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <span class="n">negate</span> <span class="o">=</span> <span class="n">negative</span> |
| |
| |
| <div class="viewcode-block" id="pi"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.pi.html#pyspark.sql.functions.pi">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">pi</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns Pi.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.range(1).select(pi()).show()</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> | PI()|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> |3.141592653589793|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"pi"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="positive"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.positive.html#pyspark.sql.functions.positive">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">positive</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the value.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> input value column.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(-1,), (0,), (1,)], ['v'])</span> |
| <span class="sd"> >>> df.select(positive("v").alias("p")).show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | p|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | -1|</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"positive"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="rint"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.rint.html#pyspark.sql.functions.rint">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">rint</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the double value that is closest in value to the argument and</span> |
| <span class="sd"> is equal to a mathematical integer.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(rint(lit(10.6))).show()</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> |rint(10.6)|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> | 11.0|</span> |
| <span class="sd"> +----------+</span> |
| |
| <span class="sd"> >>> df.select(rint(lit(10.3))).show()</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> |rint(10.3)|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> | 10.0|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"rint"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="sec"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sec.html#pyspark.sql.functions.sec">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">sec</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes secant of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Angle in radians</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> Secant of the angle.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(sec(lit(1.5))).first()</span> |
| <span class="sd"> Row(SEC(1.5)=14.13683...)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"sec"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="signum"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.signum.html#pyspark.sql.functions.signum">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">signum</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the signum of the given value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(1).select(</span> |
| <span class="sd"> ... sf.signum(sf.lit(-5)),</span> |
| <span class="sd"> ... sf.signum(sf.lit(6))</span> |
| <span class="sd"> ... ).show()</span> |
| <span class="sd"> +----------+---------+</span> |
| <span class="sd"> |SIGNUM(-5)|SIGNUM(6)|</span> |
| <span class="sd"> +----------+---------+</span> |
| <span class="sd"> | -1.0| 1.0|</span> |
| <span class="sd"> +----------+---------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"signum"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="sign"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sign.html#pyspark.sql.functions.sign">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">sign</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the signum of the given value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(1).select(</span> |
| <span class="sd"> ... sf.sign(sf.lit(-5)),</span> |
| <span class="sd"> ... sf.sign(sf.lit(6))</span> |
| <span class="sd"> ... ).show()</span> |
| <span class="sd"> +--------+-------+</span> |
| <span class="sd"> |sign(-5)|sign(6)|</span> |
| <span class="sd"> +--------+-------+</span> |
| <span class="sd"> | -1.0| 1.0|</span> |
| <span class="sd"> +--------+-------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"sign"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="sin"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sin.html#pyspark.sql.functions.sin">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">sin</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes sine of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> sine of the angle, as if computed by `java.lang.Math.sin()`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import math</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(sin(lit(math.radians(90)))).first()</span> |
| <span class="sd"> Row(SIN(1.57079...)=1.0)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"sin"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="sinh"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sinh.html#pyspark.sql.functions.sinh">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">sinh</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes hyperbolic sine of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> hyperbolic angle.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> hyperbolic sine of the given value,</span> |
| <span class="sd"> as if computed by `java.lang.Math.sinh()`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(sinh(lit(1.1))).first()</span> |
| <span class="sd"> Row(SINH(1.1)=1.33564...)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"sinh"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="tan"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.tan.html#pyspark.sql.functions.tan">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">tan</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes tangent of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> angle in radians</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> tangent of the given value, as if computed by `java.lang.Math.tan()`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import math</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(tan(lit(math.radians(45)))).first()</span> |
| <span class="sd"> Row(TAN(0.78539...)=0.99999...)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"tan"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="tanh"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.tanh.html#pyspark.sql.functions.tanh">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">tanh</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes hyperbolic tangent of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> hyperbolic angle</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> hyperbolic tangent of the given value</span> |
| <span class="sd"> as if computed by `java.lang.Math.tanh()`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import math</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(tanh(lit(math.radians(90)))).first()</span> |
| <span class="sd"> Row(TANH(1.57079...)=0.91715...)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"tanh"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="toDegrees"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.toDegrees.html#pyspark.sql.functions.toDegrees">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">toDegrees</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> .. deprecated:: 2.1.0</span> |
| <span class="sd"> Use :func:`degrees` instead.</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">"Deprecated in 2.1, use degrees instead."</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">degrees</span><span class="p">(</span><span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="toRadians"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.toRadians.html#pyspark.sql.functions.toRadians">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">toRadians</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> .. deprecated:: 2.1.0</span> |
| <span class="sd"> Use :func:`radians` instead.</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">"Deprecated in 2.1, use radians instead."</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">radians</span><span class="p">(</span><span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bitwiseNOT"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bitwiseNOT.html#pyspark.sql.functions.bitwiseNOT">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bitwiseNOT</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes bitwise not.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> .. deprecated:: 3.2.0</span> |
| <span class="sd"> Use :func:`bitwise_not` instead.</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">"Deprecated in 3.2, use bitwise_not instead."</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">bitwise_not</span><span class="p">(</span><span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bitwise_not"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bitwise_not.html#pyspark.sql.functions.bitwise_not">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bitwise_not</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes bitwise not.</span> |
| |
| <span class="sd"> .. versionadded:: 3.2.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(bitwise_not(lit(0))).show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | ~0|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | -1|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> >>> df.select(bitwise_not(lit(1))).show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | ~1|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | -2|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bitwise_not"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bit_count"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bit_count.html#pyspark.sql.functions.bit_count">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bit_count</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the number of bits that are set in the argument expr as an unsigned 64-bit integer,</span> |
| <span class="sd"> or NULL if the argument is NULL.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the number of bits that are set in the argument expr as an unsigned 64-bit integer,</span> |
| <span class="sd"> or NULL if the argument is NULL.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[1],[1],[2]], ["c"])</span> |
| <span class="sd"> >>> df.select(bit_count("c")).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |bit_count(c)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bit_count"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bit_get"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bit_get.html#pyspark.sql.functions.bit_get">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bit_get</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">pos</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the value of the bit (0 or 1) at the specified position.</span> |
| <span class="sd"> The positions are numbered from right to left, starting at zero.</span> |
| <span class="sd"> The position argument cannot be negative.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| <span class="sd"> pos : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The positions are numbered from right to left, starting at zero.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the value of the bit (0 or 1) at the specified position.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[1],[1],[2]], ["c"])</span> |
| <span class="sd"> >>> df.select(bit_get("c", lit(1))).show()</span> |
| <span class="sd"> +-------------+</span> |
| <span class="sd"> |bit_get(c, 1)|</span> |
| <span class="sd"> +-------------+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> +-------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bit_get"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">pos</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="getbit"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.getbit.html#pyspark.sql.functions.getbit">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">getbit</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">pos</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the value of the bit (0 or 1) at the specified position.</span> |
| <span class="sd"> The positions are numbered from right to left, starting at zero.</span> |
| <span class="sd"> The position argument cannot be negative.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| <span class="sd"> pos : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The positions are numbered from right to left, starting at zero.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the value of the bit (0 or 1) at the specified position.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [[1], [1], [2]], ["c"]</span> |
| <span class="sd"> ... ).select(sf.getbit("c", sf.lit(1))).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |getbit(c, 1)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"getbit"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">pos</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="asc_nulls_first"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.asc_nulls_first.html#pyspark.sql.functions.asc_nulls_first">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">asc_nulls_first</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a sort expression based on the ascending order of the given</span> |
| <span class="sd"> column name, and null values return before non-null values.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to sort by in the ascending order.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column specifying the order.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df1 = spark.createDataFrame([(1, "Bob"),</span> |
| <span class="sd"> ... (0, None),</span> |
| <span class="sd"> ... (2, "Alice")], ["age", "name"])</span> |
| <span class="sd"> >>> df1.sort(asc_nulls_first(df1.name)).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 0| NULL|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 1| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="p">(</span> |
| <span class="n">col</span><span class="o">.</span><span class="n">asc_nulls_first</span><span class="p">()</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> |
| <span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"asc_nulls_first"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="asc_nulls_last"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.asc_nulls_last.html#pyspark.sql.functions.asc_nulls_last">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">asc_nulls_last</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a sort expression based on the ascending order of the given</span> |
| <span class="sd"> column name, and null values appear after non-null values.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to sort by in the ascending order.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column specifying the order.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df1 = spark.createDataFrame([(0, None),</span> |
| <span class="sd"> ... (1, "Bob"),</span> |
| <span class="sd"> ... (2, "Alice")], ["age", "name"])</span> |
| <span class="sd"> >>> df1.sort(asc_nulls_last(df1.name)).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 1| Bob|</span> |
| <span class="sd"> | 0| NULL|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="p">(</span> |
| <span class="n">col</span><span class="o">.</span><span class="n">asc_nulls_last</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"asc_nulls_last"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="desc_nulls_first"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.desc_nulls_first.html#pyspark.sql.functions.desc_nulls_first">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">desc_nulls_first</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a sort expression based on the descending order of the given</span> |
| <span class="sd"> column name, and null values appear before non-null values.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to sort by in the descending order.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column specifying the order.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df1 = spark.createDataFrame([(0, None),</span> |
| <span class="sd"> ... (1, "Bob"),</span> |
| <span class="sd"> ... (2, "Alice")], ["age", "name"])</span> |
| <span class="sd"> >>> df1.sort(desc_nulls_first(df1.name)).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 0| NULL|</span> |
| <span class="sd"> | 1| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="p">(</span> |
| <span class="n">col</span><span class="o">.</span><span class="n">desc_nulls_first</span><span class="p">()</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> |
| <span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"desc_nulls_first"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="desc_nulls_last"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.desc_nulls_last.html#pyspark.sql.functions.desc_nulls_last">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">desc_nulls_last</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a sort expression based on the descending order of the given</span> |
| <span class="sd"> column name, and null values appear after non-null values.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to sort by in the descending order.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column specifying the order.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df1 = spark.createDataFrame([(0, None),</span> |
| <span class="sd"> ... (1, "Bob"),</span> |
| <span class="sd"> ... (2, "Alice")], ["age", "name"])</span> |
| <span class="sd"> >>> df1.sort(desc_nulls_last(df1.name)).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 1| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 0| NULL|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="p">(</span> |
| <span class="n">col</span><span class="o">.</span><span class="n">desc_nulls_last</span><span class="p">()</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> |
| <span class="k">else</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"desc_nulls_last"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="stddev"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.stddev.html#pyspark.sql.functions.stddev">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">stddev</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: alias for stddev_samp.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> standard deviation of given column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(6).select(sf.stddev("id")).show()</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> | stddev(id)|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> |1.8708286933869...|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"stddev"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="std"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.std.html#pyspark.sql.functions.std">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: alias for stddev_samp.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> standard deviation of given column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(6).select(sf.std("id")).show()</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> | std(id)|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> |1.8708286933869...|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"std"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="stddev_samp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.stddev_samp.html#pyspark.sql.functions.stddev_samp">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">stddev_samp</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the unbiased sample standard deviation of</span> |
| <span class="sd"> the expression in a group.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> standard deviation of given column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(6).select(sf.stddev_samp("id")).show()</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> | stddev_samp(id)|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> |1.8708286933869...|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"stddev_samp"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="stddev_pop"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.stddev_pop.html#pyspark.sql.functions.stddev_pop">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">stddev_pop</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns population standard deviation of</span> |
| <span class="sd"> the expression in a group.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> standard deviation of given column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(6).select(sf.stddev_pop("id")).show()</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> | stddev_pop(id)|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> |1.707825127659...|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"stddev_pop"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
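| # Illustrative sketch (not part of the upstream source): relates the two estimators |
| # shown in the doctests above. For the values 0..5 the mean is 2.5 and the sum of |
| # squared deviations is 17.5, so the sample estimate divides by n - 1 = 5 while the |
| # population estimate divides by n = 6. Assumes an active SparkSession bound to |
| # `spark`, as the doctests do. |
| def _example_stddev_estimators(spark): |
|     import math |
|     import pyspark.sql.functions as sf |
|     row = spark.range(6).select(sf.stddev_samp("id"), sf.stddev_pop("id")).first() |
|     assert abs(row[0] - math.sqrt(17.5 / 5)) < 1e-9  # 1.8708..., matches stddev_samp |
|     assert abs(row[1] - math.sqrt(17.5 / 6)) < 1e-9  # 1.7078..., matches stddev_pop |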
| |
| <div class="viewcode-block" id="variance"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.variance.html#pyspark.sql.functions.variance">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">variance</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd">    Aggregate function: alias for var_samp.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> variance of given column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(6)</span> |
| <span class="sd"> >>> df.select(variance(df.id)).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |var_samp(id)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 3.5|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"variance"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="var_samp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.var_samp.html#pyspark.sql.functions.var_samp">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">var_samp</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the unbiased sample variance of</span> |
| <span class="sd"> the values in a group.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> variance of given column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(6)</span> |
| <span class="sd"> >>> df.select(var_samp(df.id)).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |var_samp(id)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 3.5|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"var_samp"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="var_pop"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.var_pop.html#pyspark.sql.functions.var_pop">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">var_pop</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the population variance of the values in a group.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> variance of given column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(6)</span> |
| <span class="sd"> >>> df.select(var_pop(df.id)).first()</span> |
| <span class="sd"> Row(var_pop(id)=2.91666...)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"var_pop"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
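| # Illustrative sketch (not part of the upstream source): `variance` is documented |
| # above as an alias for `var_samp`, and `var_samp` and `var_pop` differ only in the |
| # divisor (n - 1 versus n). For 0..5 the sum of squared deviations is 17.5, giving |
| # var_samp = 17.5 / 5 = 3.5 and var_pop = 17.5 / 6 = 2.9166... Assumes an active |
| # SparkSession bound to `spark`. |
| def _example_variance_estimators(spark): |
|     import pyspark.sql.functions as sf |
|     row = spark.range(6).select( |
|         sf.variance("id"), sf.var_samp("id"), sf.var_pop("id") |
|     ).first() |
|     assert abs(row[0] - 3.5) < 1e-9 and abs(row[1] - 3.5) < 1e-9 |
|     assert abs(row[2] - 17.5 / 6) < 1e-9 |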
| |
| <div class="viewcode-block" id="regr_avgx"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_avgx.html#pyspark.sql.functions.regr_avgx">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regr_avgx</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the average of the independent variable for non-null pairs</span> |
| <span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> y : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the dependent variable.</span> |
| <span class="sd"> x : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the independent variable.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the average of the independent variable for non-null pairs in a group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> x = (col("id") % 3).alias("x")</span> |
| <span class="sd"> >>> y = (randn(42) + x * 10).alias("y")</span> |
| <span class="sd"> >>> df = spark.range(0, 1000, 1, 1).select(x, y)</span> |
| <span class="sd"> >>> df.select(regr_avgx("y", "x")).first()</span> |
| <span class="sd"> Row(regr_avgx(y, x)=0.999)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regr_avgx"</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regr_avgy"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_avgy.html#pyspark.sql.functions.regr_avgy">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regr_avgy</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the average of the dependent variable for non-null pairs</span> |
| <span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> y : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the dependent variable.</span> |
| <span class="sd"> x : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the independent variable.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the average of the dependent variable for non-null pairs in a group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> x = (col("id") % 3).alias("x")</span> |
| <span class="sd"> >>> y = (randn(42) + x * 10).alias("y")</span> |
| <span class="sd"> >>> df = spark.range(0, 1000, 1, 1).select(x, y)</span> |
| <span class="sd"> >>> df.select(regr_avgy("y", "x")).first()</span> |
| <span class="sd"> Row(regr_avgy(y, x)=9.980732994136464)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regr_avgy"</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regr_count"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_count.html#pyspark.sql.functions.regr_count">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regr_count</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the number of non-null number pairs</span> |
| <span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> y : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the dependent variable.</span> |
| <span class="sd"> x : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the independent variable.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the number of non-null number pairs in a group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> x = (col("id") % 3).alias("x")</span> |
| <span class="sd"> >>> y = (randn(42) + x * 10).alias("y")</span> |
| <span class="sd"> >>> df = spark.range(0, 1000, 1, 1).select(x, y)</span> |
| <span class="sd"> >>> df.select(regr_count("y", "x")).first()</span> |
| <span class="sd"> Row(regr_count(y, x)=1000)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regr_count"</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regr_intercept"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_intercept.html#pyspark.sql.functions.regr_intercept">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regr_intercept</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the intercept of the univariate linear regression line</span> |
| <span class="sd"> for non-null pairs in a group, where `y` is the dependent variable and</span> |
| <span class="sd"> `x` is the independent variable.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> y : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the dependent variable.</span> |
| <span class="sd"> x : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the independent variable.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the intercept of the univariate linear regression line for non-null pairs in a group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> x = (col("id") % 3).alias("x")</span> |
| <span class="sd"> >>> y = (randn(42) + x * 10).alias("y")</span> |
| <span class="sd"> >>> df = spark.range(0, 1000, 1, 1).select(x, y)</span> |
| <span class="sd"> >>> df.select(regr_intercept("y", "x")).first()</span> |
| <span class="sd"> Row(regr_intercept(y, x)=-0.04961745990969568)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regr_intercept"</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regr_r2"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_r2.html#pyspark.sql.functions.regr_r2">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regr_r2</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the coefficient of determination for non-null pairs</span> |
| <span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> y : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the dependent variable.</span> |
| <span class="sd"> x : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the independent variable.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the coefficient of determination for non-null pairs in a group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> x = (col("id") % 3).alias("x")</span> |
| <span class="sd"> >>> y = (randn(42) + x * 10).alias("y")</span> |
| <span class="sd"> >>> df = spark.range(0, 1000, 1, 1).select(x, y)</span> |
| <span class="sd"> >>> df.select(regr_r2("y", "x")).first()</span> |
| <span class="sd"> Row(regr_r2(y, x)=0.9851908293645436)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regr_r2"</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regr_slope"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_slope.html#pyspark.sql.functions.regr_slope">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regr_slope</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the slope of the linear regression line for non-null pairs</span> |
| <span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> y : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the dependent variable.</span> |
| <span class="sd"> x : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the independent variable.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the slope of the linear regression line for non-null pairs in a group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> x = (col("id") % 3).alias("x")</span> |
| <span class="sd"> >>> y = (randn(42) + x * 10).alias("y")</span> |
| <span class="sd"> >>> df = spark.range(0, 1000, 1, 1).select(x, y)</span> |
| <span class="sd"> >>> df.select(regr_slope("y", "x")).first()</span> |
| <span class="sd"> Row(regr_slope(y, x)=10.040390844891048)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regr_slope"</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regr_sxx"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_sxx.html#pyspark.sql.functions.regr_sxx">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regr_sxx</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns REGR_COUNT(y, x) * VAR_POP(x) for non-null pairs</span> |
| <span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> y : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the dependent variable.</span> |
| <span class="sd"> x : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the independent variable.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> REGR_COUNT(y, x) * VAR_POP(x) for non-null pairs in a group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> x = (col("id") % 3).alias("x")</span> |
| <span class="sd"> >>> y = (randn(42) + x * 10).alias("y")</span> |
| <span class="sd"> >>> df = spark.range(0, 1000, 1, 1).select(x, y)</span> |
| <span class="sd"> >>> df.select(regr_sxx("y", "x")).first()</span> |
| <span class="sd"> Row(regr_sxx(y, x)=666.9989999999996)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regr_sxx"</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regr_sxy"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_sxy.html#pyspark.sql.functions.regr_sxy">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regr_sxy</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns REGR_COUNT(y, x) * COVAR_POP(y, x) for non-null pairs</span> |
| <span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> y : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the dependent variable.</span> |
| <span class="sd"> x : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the independent variable.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> REGR_COUNT(y, x) * COVAR_POP(y, x) for non-null pairs in a group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> x = (col("id") % 3).alias("x")</span> |
| <span class="sd"> >>> y = (randn(42) + x * 10).alias("y")</span> |
| <span class="sd"> >>> df = spark.range(0, 1000, 1, 1).select(x, y)</span> |
| <span class="sd"> >>> df.select(regr_sxy("y", "x")).first()</span> |
| <span class="sd"> Row(regr_sxy(y, x)=6696.93065315148)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regr_sxy"</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regr_syy"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regr_syy.html#pyspark.sql.functions.regr_syy">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regr_syy</span><span class="p">(</span><span class="n">y</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">x</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns REGR_COUNT(y, x) * VAR_POP(y) for non-null pairs</span> |
| <span class="sd"> in a group, where `y` is the dependent variable and `x` is the independent variable.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> y : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the dependent variable.</span> |
| <span class="sd"> x : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the independent variable.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> REGR_COUNT(y, x) * VAR_POP(y) for non-null pairs in a group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> x = (col("id") % 3).alias("x")</span> |
| <span class="sd"> >>> y = (randn(42) + x * 10).alias("y")</span> |
| <span class="sd"> >>> df = spark.range(0, 1000, 1, 1).select(x, y)</span> |
| <span class="sd"> >>> df.select(regr_syy("y", "x")).first()</span> |
| <span class="sd"> Row(regr_syy(y, x)=68250.53503811295)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regr_syy"</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">)</span></div> |
| |
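| # Illustrative sketch (not part of the upstream source): checks, on the same toy data |
| # as the doctests above, the identities implied by the regr_* docstrings, e.g. |
| # regr_sxx(y, x) = regr_count(y, x) * var_pop(x), and the least-squares relations |
| # slope = regr_sxy / regr_sxx and intercept = regr_avgy - slope * regr_avgx, which |
| # also hold for the doctest outputs shown above. Assumes an active SparkSession |
| # bound to `spark`. |
| def _example_regr_identities(spark): |
|     import pyspark.sql.functions as sf |
|     x = (sf.col("id") % 3).alias("x") |
|     y = (sf.randn(42) + x * 10).alias("y") |
|     df = spark.range(0, 1000, 1, 1).select(x, y) |
|     r = df.select( |
|         sf.regr_count("y", "x").alias("n"), |
|         sf.var_pop("x").alias("vx"), |
|         sf.regr_sxx("y", "x").alias("sxx"), |
|         sf.regr_sxy("y", "x").alias("sxy"), |
|         sf.regr_slope("y", "x").alias("slope"), |
|         sf.regr_intercept("y", "x").alias("intercept"), |
|         sf.regr_avgx("y", "x").alias("ax"), |
|         sf.regr_avgy("y", "x").alias("ay"), |
|     ).first() |
|     assert abs(r["sxx"] - r["n"] * r["vx"]) < 1e-6 |
|     assert abs(r["slope"] - r["sxy"] / r["sxx"]) < 1e-6 |
|     assert abs(r["intercept"] - (r["ay"] - r["slope"] * r["ax"])) < 1e-6 |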
| |
| <div class="viewcode-block" id="every"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.every.html#pyspark.sql.functions.every">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">every</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns true if all values of `col` are true.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to check if all values are true.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> true if all values of `col` are true, false otherwise.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [[True], [True], [True]], ["flag"]</span> |
| <span class="sd"> ... ).select(sf.every("flag")).show()</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> |every(flag)|</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> | true|</span> |
| <span class="sd"> +-----------+</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [[True], [False], [True]], ["flag"]</span> |
| <span class="sd"> ... ).select(sf.every("flag")).show()</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> |every(flag)|</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> | false|</span> |
| <span class="sd"> +-----------+</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [[False], [False], [False]], ["flag"]</span> |
| <span class="sd"> ... ).select(sf.every("flag")).show()</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> |every(flag)|</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> | false|</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"every"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bool_and"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bool_and.html#pyspark.sql.functions.bool_and">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bool_and</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns true if all values of `col` are true.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to check if all values are true.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> true if all values of `col` are true, false otherwise.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[True], [True], [True]], ["flag"])</span> |
| <span class="sd"> >>> df.select(bool_and("flag")).show()</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> |bool_and(flag)|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> | true|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[True], [False], [True]], ["flag"])</span> |
| <span class="sd"> >>> df.select(bool_and("flag")).show()</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> |bool_and(flag)|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> | false|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[False], [False], [False]], ["flag"])</span> |
| <span class="sd"> >>> df.select(bool_and("flag")).show()</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> |bool_and(flag)|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> | false|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bool_and"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="some"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.some.html#pyspark.sql.functions.some">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">some</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns true if at least one value of `col` is true.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to check if at least one value is true.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> true if at least one value of `col` is true, false otherwise.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [[True], [True], [True]], ["flag"]</span> |
| <span class="sd"> ... ).select(sf.some("flag")).show()</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> |some(flag)|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> | true|</span> |
| <span class="sd"> +----------+</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [[True], [False], [True]], ["flag"]</span> |
| <span class="sd"> ... ).select(sf.some("flag")).show()</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> |some(flag)|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> | true|</span> |
| <span class="sd"> +----------+</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [[False], [False], [False]], ["flag"]</span> |
| <span class="sd"> ... ).select(sf.some("flag")).show()</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> |some(flag)|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> | false|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"some"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bool_or"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bool_or.html#pyspark.sql.functions.bool_or">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bool_or</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns true if at least one value of `col` is true.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to check if at least one value is true.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> true if at least one value of `col` is true, false otherwise.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[True], [True], [True]], ["flag"])</span> |
| <span class="sd"> >>> df.select(bool_or("flag")).show()</span> |
| <span class="sd"> +-------------+</span> |
| <span class="sd"> |bool_or(flag)|</span> |
| <span class="sd"> +-------------+</span> |
| <span class="sd"> | true|</span> |
| <span class="sd"> +-------------+</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[True], [False], [True]], ["flag"])</span> |
| <span class="sd"> >>> df.select(bool_or("flag")).show()</span> |
| <span class="sd"> +-------------+</span> |
| <span class="sd"> |bool_or(flag)|</span> |
| <span class="sd"> +-------------+</span> |
| <span class="sd"> | true|</span> |
| <span class="sd"> +-------------+</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[False], [False], [False]], ["flag"])</span> |
| <span class="sd"> >>> df.select(bool_or("flag")).show()</span> |
| <span class="sd"> +-------------+</span> |
| <span class="sd"> |bool_or(flag)|</span> |
| <span class="sd"> +-------------+</span> |
| <span class="sd"> | false|</span> |
| <span class="sd"> +-------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bool_or"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
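| # Illustrative sketch (not part of the upstream source): per the descriptions above, |
| # every/bool_and and some/bool_or compute the same results on boolean input. |
| # Assumes an active SparkSession bound to `spark`. |
| def _example_boolean_aggregates(spark): |
|     import pyspark.sql.functions as sf |
|     df = spark.createDataFrame([[True], [False], [True]], ["flag"]) |
|     row = df.select( |
|         sf.every("flag"), sf.bool_and("flag"), sf.some("flag"), sf.bool_or("flag") |
|     ).first() |
|     assert (row[0], row[1]) == (False, False)  # not every value is true |
|     assert (row[2], row[3]) == (True, True)  # at least one value is true |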
| |
| <div class="viewcode-block" id="bit_and"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bit_and.html#pyspark.sql.functions.bit_and">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bit_and</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the bitwise AND of all non-null input values, or null if none.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the bitwise AND of all non-null input values, or null if none.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[1],[1],[2]], ["c"])</span> |
| <span class="sd"> >>> df.select(bit_and("c")).first()</span> |
| <span class="sd"> Row(bit_and(c)=0)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bit_and"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bit_or"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bit_or.html#pyspark.sql.functions.bit_or">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bit_or</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the bitwise OR of all non-null input values, or null if none.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the bitwise OR of all non-null input values, or null if none.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[1],[1],[2]], ["c"])</span> |
| <span class="sd"> >>> df.select(bit_or("c")).first()</span> |
| <span class="sd"> Row(bit_or(c)=3)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bit_or"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bit_xor"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bit_xor.html#pyspark.sql.functions.bit_xor">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bit_xor</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the bitwise XOR of all non-null input values, or null if none.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the bitwise XOR of all non-null input values, or null if none.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[1],[1],[2]], ["c"])</span> |
| <span class="sd"> >>> df.select(bit_xor("c")).first()</span> |
| <span class="sd"> Row(bit_xor(c)=2)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bit_xor"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
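| # Illustrative sketch (not part of the upstream source): worked check of the doctest |
| # values above for the inputs 1, 1, 2, where 1 & 1 & 2 == 0, 1 | 1 | 2 == 3 and |
| # 1 ^ 1 ^ 2 == 2. Assumes an active SparkSession bound to `spark`. |
| def _example_bit_aggregates(spark): |
|     import pyspark.sql.functions as sf |
|     df = spark.createDataFrame([[1], [1], [2]], ["c"]) |
|     row = df.select(sf.bit_and("c"), sf.bit_or("c"), sf.bit_xor("c")).first() |
|     assert (row[0], row[1], row[2]) == (1 & 1 & 2, 1 | 1 | 2, 1 ^ 1 ^ 2) |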
| |
| <div class="viewcode-block" id="skewness"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.skewness.html#pyspark.sql.functions.skewness">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">skewness</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the skewness of the values in a group.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> skewness of given column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[1],[1],[2]], ["c"])</span> |
| <span class="sd"> >>> df.select(skewness(df.c)).first()</span> |
| <span class="sd"> Row(skewness(c)=0.70710...)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"skewness"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="kurtosis"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.kurtosis.html#pyspark.sql.functions.kurtosis">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">kurtosis</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the kurtosis of the values in a group.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> kurtosis of given column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[1],[1],[2]], ["c"])</span> |
| <span class="sd"> >>> df.select(kurtosis(df.c)).show()</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> |kurtosis(c)|</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> | -1.5|</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"kurtosis"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
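| # Illustrative sketch (not part of the upstream source): worked check of the doctest |
| # values above for the sample [1, 1, 2]. With mean 4/3 the population central moments |
| # are m2 = 2/9, m3 = 2/27 and m4 = 2/27, giving skewness m3 / m2**1.5 ~= 0.7071 and |
| # excess kurtosis m4 / m2**2 - 3 = -1.5, consistent with the outputs shown above. |
| # Assumes an active SparkSession bound to `spark`. |
| def _example_skewness_kurtosis(spark): |
|     import pyspark.sql.functions as sf |
|     m2, m3, m4 = 2 / 9, 2 / 27, 2 / 27 |
|     df = spark.createDataFrame([[1], [1], [2]], ["c"]) |
|     row = df.select(sf.skewness("c"), sf.kurtosis("c")).first() |
|     assert abs(row[0] - m3 / m2 ** 1.5) < 1e-9 |
|     assert abs(row[1] - (m4 / m2 ** 2 - 3)) < 1e-9 |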
| |
| <div class="viewcode-block" id="collect_list"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.collect_list.html#pyspark.sql.functions.collect_list">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">collect_list</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns a list of objects with duplicates.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The function is non-deterministic because the order of collected results depends</span> |
| <span class="sd"> on the order of the rows which may be non-deterministic after a shuffle.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> list of objects with duplicates.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',))</span> |
| <span class="sd"> >>> df2.agg(collect_list('age')).collect()</span> |
| <span class="sd"> [Row(collect_list(age)=[2, 5, 5])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"collect_list"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_agg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_agg.html#pyspark.sql.functions.array_agg">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_agg</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns a list of objects with duplicates.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> list of objects with duplicates.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[1],[1],[2]], ["c"])</span> |
| <span class="sd"> >>> df.agg(array_agg('c').alias('r')).collect()</span> |
| <span class="sd"> [Row(r=[1, 1, 2])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"array_agg"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="collect_set"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.collect_set.html#pyspark.sql.functions.collect_set">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">collect_set</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns a set of objects with duplicate elements eliminated.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The function is non-deterministic because the order of collected results depends</span> |
| <span class="sd"> on the order of the rows which may be non-deterministic after a shuffle.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> list of objects with no duplicates.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(2,), (5,), (5,)], ('age',))</span> |
| <span class="sd"> >>> df2.agg(array_sort(collect_set('age')).alias('c')).collect()</span> |
| <span class="sd"> [Row(c=[2, 5])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"collect_set"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
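| # Illustrative sketch (not part of the upstream source): per the non-determinism |
| # notes above, the collection order is not guaranteed, so sorting the collected |
| # array gives a stable result. Assumes an active SparkSession bound to `spark`. |
| def _example_collect(spark): |
|     import pyspark.sql.functions as sf |
|     df = spark.createDataFrame([(2,), (5,), (5,)], ("age",)) |
|     row = df.agg( |
|         sf.array_sort(sf.collect_list("age")).alias("all_ages"), |
|         sf.array_sort(sf.collect_set("age")).alias("distinct_ages"), |
|     ).first() |
|     assert row["all_ages"] == [2, 5, 5]  # duplicates preserved |
|     assert row["distinct_ages"] == [2, 5]  # duplicates eliminated |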
| |
| <div class="viewcode-block" id="degrees"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.degrees.html#pyspark.sql.functions.degrees">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">degrees</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Converts an angle measured in radians to an approximately equivalent angle</span> |
| <span class="sd"> measured in degrees.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> angle in radians</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> angle in degrees, as if computed by `java.lang.Math.toDegrees()`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import math</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(degrees(lit(math.pi))).first()</span> |
| <span class="sd"> Row(DEGREES(3.14159...)=180.0)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"degrees"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="radians"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.radians.html#pyspark.sql.functions.radians">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">radians</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Converts an angle measured in degrees to an approximately equivalent angle</span> |
| <span class="sd"> measured in radians.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> angle in degrees</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> angle in radians, as if computed by `java.lang.Math.toRadians()`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(radians(lit(180))).first()</span> |
| <span class="sd"> Row(RADIANS(180)=3.14159...)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"radians"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
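| # Illustrative sketch (not part of the upstream source): round trip between the two |
| # conversions, 180 degrees -> pi radians -> 180 degrees. Assumes an active |
| # SparkSession bound to `spark`. |
| def _example_degrees_radians(spark): |
|     import math |
|     import pyspark.sql.functions as sf |
|     row = spark.range(1).select( |
|         sf.radians(sf.lit(180)), sf.degrees(sf.radians(sf.lit(180))) |
|     ).first() |
|     assert abs(row[0] - math.pi) < 1e-12 |
|     assert abs(row[1] - 180.0) < 1e-9 |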
| |
| <div class="viewcode-block" id="atan2"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.atan2.html#pyspark.sql.functions.atan2">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">atan2</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">float</span><span class="p">],</span> <span class="n">col2</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">float</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd">    Computes the angle `theta` from the conversion of rectangular coordinates</span> |
| <span class="sd">    (`x`, `y`) to polar coordinates (`r`, `theta`).</span> |
|
| <span class="sd">    .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : str, :class:`~pyspark.sql.Column` or float</span> |
| <span class="sd"> coordinate on y-axis</span> |
| <span class="sd"> col2 : str, :class:`~pyspark.sql.Column` or float</span> |
| <span class="sd"> coordinate on x-axis</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the `theta` component of the point</span> |
| <span class="sd"> (`r`, `theta`)</span> |
| <span class="sd"> in polar coordinates that corresponds to the point</span> |
| <span class="sd"> (`x`, `y`) in Cartesian coordinates,</span> |
| <span class="sd"> as if computed by `java.lang.Math.atan2()`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(atan2(lit(1), lit(2))).first()</span> |
| <span class="sd"> Row(ATAN2(1, 2)=0.46364...)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_binary_math_function</span><span class="p">(</span><span class="s2">"atan2"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="hypot"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hypot.html#pyspark.sql.functions.hypot">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">hypot</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">float</span><span class="p">],</span> <span class="n">col2</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">float</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes ``sqrt(a^2 + b^2)`` without intermediate overflow or underflow.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : str, :class:`~pyspark.sql.Column` or float</span> |
| <span class="sd"> a leg.</span> |
| <span class="sd"> col2 : str, :class:`~pyspark.sql.Column` or float</span> |
| <span class="sd"> b leg.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> length of the hypotenuse.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(hypot(lit(1), lit(2))).first()</span> |
| <span class="sd"> Row(HYPOT(1, 2)=2.23606...)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_binary_math_function</span><span class="p">(</span><span class="s2">"hypot"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="pow"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.pow.html#pyspark.sql.functions.pow">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">pow</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">float</span><span class="p">],</span> <span class="n">col2</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">float</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the value of the first argument raised to the power of the second argument.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : str, :class:`~pyspark.sql.Column` or float</span> |
| <span class="sd"> the base number.</span> |
| <span class="sd"> col2 : str, :class:`~pyspark.sql.Column` or float</span> |
| <span class="sd"> the exponent number.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the base rased to the power the argument.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(pow(lit(3), lit(2))).first()</span> |
| <span class="sd"> Row(POWER(3, 2)=9.0)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_binary_math_function</span><span class="p">(</span><span class="s2">"pow"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| |
| <span class="n">power</span> <span class="o">=</span> <span class="nb">pow</span> |
| |
| |
| <div class="viewcode-block" id="pmod"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.pmod.html#pyspark.sql.functions.pmod">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">pmod</span><span class="p">(</span><span class="n">dividend</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">float</span><span class="p">],</span> <span class="n">divisor</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">float</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the positive value of dividend mod divisor.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> dividend : str, :class:`~pyspark.sql.Column` or float</span> |
| <span class="sd"> the column that contains dividend, or the specified dividend value</span> |
| <span class="sd"> divisor : str, :class:`~pyspark.sql.Column` or float</span> |
| <span class="sd"> the column that contains divisor, or the specified divisor value</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> positive value of dividend mod divisor.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import pmod</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (1.0, float('nan')), (float('nan'), 2.0), (10.0, 3.0),</span> |
| <span class="sd"> ... (float('nan'), float('nan')), (-3.0, 4.0), (-10.0, 3.0),</span> |
| <span class="sd"> ... (-5.0, -6.0), (7.0, -8.0), (1.0, 2.0)],</span> |
| <span class="sd"> ... ("a", "b"))</span> |
| <span class="sd"> >>> df.select(pmod("a", "b")).show()</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> |pmod(a, b)|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> | NaN|</span> |
| <span class="sd"> | NaN|</span> |
| <span class="sd"> | 1.0|</span> |
| <span class="sd"> | NaN|</span> |
| <span class="sd"> | 1.0|</span> |
| <span class="sd"> | 2.0|</span> |
| <span class="sd"> | -5.0|</span> |
| <span class="sd"> | 7.0|</span> |
| <span class="sd"> | 1.0|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_binary_math_function</span><span class="p">(</span><span class="s2">"pmod"</span><span class="p">,</span> <span class="n">dividend</span><span class="p">,</span> <span class="n">divisor</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="width_bucket"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.width_bucket.html#pyspark.sql.functions.width_bucket">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">width_bucket</span><span class="p">(</span> |
| <span class="n">v</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="nb">min</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="nb">max</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">numBucket</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">int</span><span class="p">],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the bucket number into which the value of this expression would fall</span> |
| <span class="sd"> after being evaluated. Note that input arguments must follow conditions listed below;</span> |
| <span class="sd"> otherwise, the method will return null.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> v : str or :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> value to compute a bucket number in the histogram</span> |
| <span class="sd"> min : str or :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> minimum value of the histogram</span> |
| <span class="sd"> max : str or :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> maximum value of the histogram</span> |
| <span class="sd"> numBucket : str, :class:`~pyspark.sql.Column` or int</span> |
| <span class="sd"> the number of buckets</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the bucket number into which the value would fall after being evaluated</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (5.3, 0.2, 10.6, 5),</span> |
| <span class="sd"> ... (-2.1, 1.3, 3.4, 3),</span> |
| <span class="sd"> ... (8.1, 0.0, 5.7, 4),</span> |
| <span class="sd"> ... (-0.9, 5.2, 0.5, 2)],</span> |
| <span class="sd"> ... ['v', 'min', 'max', 'n'])</span> |
| <span class="sd"> >>> df.select(width_bucket('v', 'min', 'max', 'n')).show()</span> |
| <span class="sd"> +----------------------------+</span> |
| <span class="sd"> |width_bucket(v, min, max, n)|</span> |
| <span class="sd"> +----------------------------+</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 5|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> +----------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="n">numBucket</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">numBucket</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">numBucket</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">numBucket</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"width_bucket"</span><span class="p">,</span> <span class="n">v</span><span class="p">,</span> <span class="nb">min</span><span class="p">,</span> <span class="nb">max</span><span class="p">,</span> <span class="n">numBucket</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="row_number"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.row_number.html#pyspark.sql.functions.row_number">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">row_number</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Window function: returns a sequential number starting at 1 within a window partition.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for calculating row numbers.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Window</span> |
| <span class="sd"> >>> df = spark.range(3)</span> |
| <span class="sd"> >>> w = Window.orderBy(df.id.desc())</span> |
| <span class="sd"> >>> df.withColumn("desc_order", row_number().over(w)).show()</span> |
| <span class="sd"> +---+----------+</span> |
| <span class="sd"> | id|desc_order|</span> |
| <span class="sd"> +---+----------+</span> |
| <span class="sd"> | 2| 1|</span> |
| <span class="sd"> | 1| 2|</span> |
| <span class="sd"> | 0| 3|</span> |
| <span class="sd"> +---+----------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"row_number"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="dense_rank"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.dense_rank.html#pyspark.sql.functions.dense_rank">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">dense_rank</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Window function: returns the rank of rows within a window partition, without any gaps.</span> |
| |
| <span class="sd"> The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking</span> |
| <span class="sd"> sequence when there are ties. That is, if you were ranking a competition using dense_rank</span> |
| <span class="sd"> and had three people tie for second place, you would say that all three were in second</span> |
| <span class="sd"> place and that the next person came in third. Rank would give me sequential numbers, making</span> |
| <span class="sd"> the person that came in third place (after the ties) would register as coming in fifth.</span> |
| |
| <span class="sd"> This is equivalent to the DENSE_RANK function in SQL.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for calculating ranks.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Window, types</span> |
| <span class="sd"> >>> df = spark.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType())</span> |
| <span class="sd"> >>> w = Window.orderBy("value")</span> |
| <span class="sd"> >>> df.withColumn("drank", dense_rank().over(w)).show()</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> |value|drank|</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> | 1| 1|</span> |
| <span class="sd"> | 1| 1|</span> |
| <span class="sd"> | 2| 2|</span> |
| <span class="sd"> | 3| 3|</span> |
| <span class="sd"> | 3| 3|</span> |
| <span class="sd"> | 4| 4|</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"dense_rank"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="rank"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.rank.html#pyspark.sql.functions.rank">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">rank</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Window function: returns the rank of rows within a window partition.</span> |
| |
| <span class="sd"> The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking</span> |
| <span class="sd"> sequence when there are ties. That is, if you were ranking a competition using dense_rank</span> |
| <span class="sd"> and had three people tie for second place, you would say that all three were in second</span> |
| <span class="sd"> place and that the next person came in third. Rank would give me sequential numbers, making</span> |
| <span class="sd"> the person that came in third place (after the ties) would register as coming in fifth.</span> |
| |
| <span class="sd"> This is equivalent to the RANK function in SQL.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for calculating ranks.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Window, types</span> |
| <span class="sd"> >>> df = spark.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType())</span> |
| <span class="sd"> >>> w = Window.orderBy("value")</span> |
| <span class="sd"> >>> df.withColumn("drank", rank().over(w)).show()</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> |value|drank|</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> | 1| 1|</span> |
| <span class="sd"> | 1| 1|</span> |
| <span class="sd"> | 2| 3|</span> |
| <span class="sd"> | 3| 4|</span> |
| <span class="sd"> | 3| 4|</span> |
| <span class="sd"> | 4| 6|</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"rank"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="cume_dist"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.cume_dist.html#pyspark.sql.functions.cume_dist">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">cume_dist</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Window function: returns the cumulative distribution of values within a window partition,</span> |
| <span class="sd"> i.e. the fraction of rows that are below the current row.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for calculating cumulative distribution.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Window, types</span> |
| <span class="sd"> >>> df = spark.createDataFrame([1, 2, 3, 3, 4], types.IntegerType())</span> |
| <span class="sd"> >>> w = Window.orderBy("value")</span> |
| <span class="sd"> >>> df.withColumn("cd", cume_dist().over(w)).show()</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |value| cd|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | 1|0.2|</span> |
| <span class="sd"> | 2|0.4|</span> |
| <span class="sd"> | 3|0.8|</span> |
| <span class="sd"> | 3|0.8|</span> |
| <span class="sd"> | 4|1.0|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"cume_dist"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="percent_rank"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.percent_rank.html#pyspark.sql.functions.percent_rank">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">percent_rank</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Window function: returns the relative rank (i.e. percentile) of rows within a window partition.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for calculating relative rank.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Window, types</span> |
| <span class="sd"> >>> df = spark.createDataFrame([1, 1, 2, 3, 3, 4], types.IntegerType())</span> |
| <span class="sd"> >>> w = Window.orderBy("value")</span> |
| <span class="sd"> >>> df.withColumn("pr", percent_rank().over(w)).show()</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |value| pr|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | 1|0.0|</span> |
| <span class="sd"> | 1|0.0|</span> |
| <span class="sd"> | 2|0.4|</span> |
| <span class="sd"> | 3|0.6|</span> |
| <span class="sd"> | 3|0.6|</span> |
| <span class="sd"> | 4|1.0|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"percent_rank"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="approxCountDistinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.approxCountDistinct.html#pyspark.sql.functions.approxCountDistinct">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">approxCountDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">rsd</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> .. deprecated:: 2.1.0</span> |
| <span class="sd"> Use :func:`approx_count_distinct` instead.</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">"Deprecated in 2.1, use approx_count_distinct instead."</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">approx_count_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">rsd</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="approx_count_distinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.approx_count_distinct.html#pyspark.sql.functions.approx_count_distinct">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">approx_count_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">rsd</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Aggregate function: returns a new :class:`~pyspark.sql.Column` for approximate distinct count</span> |
| <span class="sd"> of column `col`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> rsd : float, optional</span> |
| <span class="sd"> maximum relative standard deviation allowed (default = 0.05).</span> |
| <span class="sd"> For rsd < 0.01, it is more efficient to use :func:`count_distinct`</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column of computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([1,2,2,3], "INT")</span> |
| <span class="sd"> >>> df.agg(approx_count_distinct("value").alias('distinct_values')).show()</span> |
| <span class="sd"> +---------------+</span> |
| <span class="sd"> |distinct_values|</span> |
| <span class="sd"> +---------------+</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> +---------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">rsd</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"approx_count_distinct"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"approx_count_distinct"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">rsd</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="broadcast"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.broadcast.html#pyspark.sql.functions.broadcast">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">broadcast</span><span class="p">(</span><span class="n">df</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Marks a DataFrame as small enough for use in broadcast joins.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.DataFrame`</span> |
| <span class="sd"> DataFrame marked as ready for broadcast join.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import types</span> |
| <span class="sd"> >>> df = spark.createDataFrame([1, 2, 3, 3, 4], types.IntegerType())</span> |
| <span class="sd"> >>> df_small = spark.range(3)</span> |
| <span class="sd"> >>> df_b = broadcast(df_small)</span> |
| <span class="sd"> >>> df.join(df_b, df.value == df_small.id).show()</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |value| id|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | 1| 1|</span> |
| <span class="sd"> | 2| 2|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">JVMView</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="p">)</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="n">broadcast</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">_jdf</span><span class="p">),</span> <span class="n">df</span><span class="o">.</span><span class="n">sparkSession</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="coalesce"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.coalesce.html#pyspark.sql.functions.coalesce">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">coalesce</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the first column that is not null.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> list of columns to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> value of the first column that is not null.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> cDf = spark.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))</span> |
| <span class="sd"> >>> cDf.show()</span> |
| <span class="sd"> +----+----+</span> |
| <span class="sd"> | a| b|</span> |
| <span class="sd"> +----+----+</span> |
| <span class="sd"> |NULL|NULL|</span> |
| <span class="sd"> | 1|NULL|</span> |
| <span class="sd"> |NULL| 2|</span> |
| <span class="sd"> +----+----+</span> |
| |
| <span class="sd"> >>> cDf.select(coalesce(cDf["a"], cDf["b"])).show()</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> |coalesce(a, b)|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> | NULL|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> +--------------+</span> |
| |
| <span class="sd"> >>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show()</span> |
| <span class="sd"> +----+----+----------------+</span> |
| <span class="sd"> | a| b|coalesce(a, 0.0)|</span> |
| <span class="sd"> +----+----+----------------+</span> |
| <span class="sd"> |NULL|NULL| 0.0|</span> |
| <span class="sd"> | 1|NULL| 1.0|</span> |
| <span class="sd"> |NULL| 2| 0.0|</span> |
| <span class="sd"> +----+----+----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"coalesce"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="corr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.corr.html#pyspark.sql.functions.corr">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">corr</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`~pyspark.sql.Column` for the Pearson Correlation Coefficient for</span> |
| <span class="sd"> ``col1`` and ``col2``.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> first column to calculate correlation.</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> second column to calculate correlation.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> Pearson Correlation Coefficient of these two column values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> a = range(20)</span> |
| <span class="sd"> >>> b = [2 * x for x in range(20)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(zip(a, b), ["a", "b"])</span> |
| <span class="sd"> >>> df.agg(corr("a", "b").alias('c')).collect()</span> |
| <span class="sd"> [Row(c=1.0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"corr"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="covar_pop"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.covar_pop.html#pyspark.sql.functions.covar_pop">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">covar_pop</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`~pyspark.sql.Column` for the population covariance of ``col1`` and</span> |
| <span class="sd"> ``col2``.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> first column to calculate covariance.</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> second column to calculate covariance.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> covariance of these two column values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> a = [1] * 10</span> |
| <span class="sd"> >>> b = [1] * 10</span> |
| <span class="sd"> >>> df = spark.createDataFrame(zip(a, b), ["a", "b"])</span> |
| <span class="sd"> >>> df.agg(covar_pop("a", "b").alias('c')).collect()</span> |
| <span class="sd"> [Row(c=0.0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"covar_pop"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="covar_samp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.covar_samp.html#pyspark.sql.functions.covar_samp">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">covar_samp</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`~pyspark.sql.Column` for the sample covariance of ``col1`` and</span> |
| <span class="sd"> ``col2``.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> first column to calculate covariance.</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> second column to calculate covariance.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> sample covariance of these two column values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> a = [1] * 10</span> |
| <span class="sd"> >>> b = [1] * 10</span> |
| <span class="sd"> >>> df = spark.createDataFrame(zip(a, b), ["a", "b"])</span> |
| <span class="sd"> >>> df.agg(covar_samp("a", "b").alias('c')).collect()</span> |
| <span class="sd"> [Row(c=0.0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"covar_samp"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="countDistinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.countDistinct.html#pyspark.sql.functions.countDistinct">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">countDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`~pyspark.sql.Column` for distinct count of ``col`` or ``cols``.</span> |
| |
| <span class="sd"> An alias of :func:`count_distinct`, and it is encouraged to use :func:`count_distinct`</span> |
| <span class="sd"> directly.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">count_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="count_distinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.count_distinct.html#pyspark.sql.functions.count_distinct">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">count_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.</span> |
| |
| <span class="sd"> .. versionadded:: 3.2.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> first column to compute on.</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> other columns to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> distinct values of these two column values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import types</span> |
| <span class="sd"> >>> df1 = spark.createDataFrame([1, 1, 3], types.IntegerType())</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([1, 2], types.IntegerType())</span> |
| <span class="sd"> >>> df1.join(df2).show()</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> |value|value|</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> | 1| 1|</span> |
| <span class="sd"> | 1| 2|</span> |
| <span class="sd"> | 1| 1|</span> |
| <span class="sd"> | 1| 2|</span> |
| <span class="sd"> | 3| 1|</span> |
| <span class="sd"> | 3| 2|</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> >>> df1.join(df2).select(count_distinct(df1.value, df2.value)).show()</span> |
| <span class="sd"> +----------------------------+</span> |
| <span class="sd"> |count(DISTINCT value, value)|</span> |
| <span class="sd"> +----------------------------+</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> +----------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span> |
| <span class="s2">"count_distinct"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">)</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="first"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.first.html#pyspark.sql.functions.first">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">first</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">ignorenulls</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Aggregate function: returns the first value in a group.</span> |
| |
| <span class="sd"> The function by default returns the first values it sees. It will return the first non-null</span> |
| <span class="sd"> value it sees when ignoreNulls is set to true. If all values are null, then null is returned.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The function is non-deterministic because its results depends on the order of the</span> |
| <span class="sd"> rows which may be non-deterministic after a shuffle.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to fetch first value for.</span> |
| <span class="sd"> ignorenulls : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> if first value is null then look for first non-null value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> first value of the group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5), ("Alice", None)], ("name", "age"))</span> |
| <span class="sd"> >>> df = df.orderBy(df.age)</span> |
| <span class="sd"> >>> df.groupby("name").agg(first("age")).orderBy("name").show()</span> |
| <span class="sd"> +-----+----------+</span> |
| <span class="sd"> | name|first(age)|</span> |
| <span class="sd"> +-----+----------+</span> |
| <span class="sd"> |Alice| NULL|</span> |
| <span class="sd"> | Bob| 5|</span> |
| <span class="sd"> +-----+----------+</span> |
| |
| <span class="sd"> Now, to ignore any nulls we needs to set ``ignorenulls`` to `True`</span> |
| |
| <span class="sd"> >>> df.groupby("name").agg(first("age", ignorenulls=True)).orderBy("name").show()</span> |
| <span class="sd"> +-----+----------+</span> |
| <span class="sd"> | name|first(age)|</span> |
| <span class="sd"> +-----+----------+</span> |
| <span class="sd"> |Alice| 2|</span> |
| <span class="sd"> | Bob| 5|</span> |
| <span class="sd"> +-----+----------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"first"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">ignorenulls</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="grouping"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.grouping.html#pyspark.sql.functions.grouping">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">grouping</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated</span> |
| <span class="sd"> or not, returns 1 for aggregated or 0 for not aggregated in the result set.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to check if it's aggregated.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> returns 1 for aggregated or 0 for not aggregated in the result set.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age"))</span> |
| <span class="sd"> >>> df.cube("name").agg(grouping("name"), sum("age")).orderBy("name").show()</span> |
| <span class="sd"> +-----+--------------+--------+</span> |
| <span class="sd"> | name|grouping(name)|sum(age)|</span> |
| <span class="sd"> +-----+--------------+--------+</span> |
| <span class="sd"> | NULL| 1| 7|</span> |
| <span class="sd"> |Alice| 0| 2|</span> |
| <span class="sd"> | Bob| 0| 5|</span> |
| <span class="sd"> +-----+--------------+--------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"grouping"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="grouping_id"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.grouping_id.html#pyspark.sql.functions.grouping_id">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">grouping_id</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the level of grouping, equals to</span> |
| |
| <span class="sd"> (grouping(c1) << (n-1)) + (grouping(c2) << (n-2)) + ... + grouping(cn)</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The list of columns should match with grouping columns exactly, or empty (means all</span> |
| <span class="sd"> the grouping columns).</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> columns to check for.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> returns level of the grouping it relates to.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, "a", "a"),</span> |
| <span class="sd"> ... (3, "a", "a"),</span> |
| <span class="sd"> ... (4, "b", "c")], ["c1", "c2", "c3"])</span> |
| <span class="sd"> >>> df.cube("c2", "c3").agg(grouping_id(), sum("c1")).orderBy("c2", "c3").show()</span> |
| <span class="sd"> +----+----+-------------+-------+</span> |
| <span class="sd"> | c2| c3|grouping_id()|sum(c1)|</span> |
| <span class="sd"> +----+----+-------------+-------+</span> |
| <span class="sd"> |NULL|NULL| 3| 8|</span> |
| <span class="sd"> |NULL| a| 2| 4|</span> |
| <span class="sd"> |NULL| c| 2| 4|</span> |
| <span class="sd"> | a|NULL| 1| 4|</span> |
| <span class="sd"> | a| a| 0| 4|</span> |
| <span class="sd"> | b|NULL| 1| 4|</span> |
| <span class="sd"> | b| c| 0| 4|</span> |
| <span class="sd"> +----+----+-------------+-------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"grouping_id"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="count_min_sketch"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.count_min_sketch.html#pyspark.sql.functions.count_min_sketch">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">count_min_sketch</span><span class="p">(</span> |
| <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">eps</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">confidence</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a count-min sketch of a column with the given esp, confidence and seed.</span> |
| <span class="sd"> The result is an array of bytes, which can be deserialized to a `CountMinSketch` before usage.</span> |
| <span class="sd"> Count-min sketch is a probabilistic data structure used for cardinality estimation</span> |
| <span class="sd"> using sub-linear space.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| <span class="sd"> eps : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> relative error, must be positive</span> |
| <span class="sd"> confidence : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> confidence, must be positive and less than 1.0</span> |
| <span class="sd"> seed : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> random seed</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> count-min sketch of the column</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[1], [2], [1]], ['data'])</span> |
| <span class="sd"> >>> df = df.agg(count_min_sketch(df.data, lit(0.5), lit(0.5), lit(1)).alias('sketch'))</span> |
| <span class="sd"> >>> df.select(hex(df.sketch).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='0000000100000000000000030000000100000004000000005D8D6AB90000000000000000000000000000000200000000000000010000000000000000')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"count_min_sketch"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">eps</span><span class="p">,</span> <span class="n">confidence</span><span class="p">,</span> <span class="n">seed</span><span class="p">)</span></div> |
| |
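| <span class="c1"># Editor's note: a minimal, self-contained sketch of the classic count-min algorithm,</span> |
| <span class="c1"># included only to illustrate what ``eps`` (relative error) and ``confidence`` control.</span> |
| <span class="c1"># It is NOT how Spark's CountMinSketch is implemented; the sizing below is the textbook</span> |
| <span class="c1"># parameterization, and the class name is made up for this note.</span> |
| import math |
| import random |
| |
| class _ToyCountMinSketch: |
|     def __init__(self, eps: float, confidence: float, seed: int) -> None: |
|         self.width = math.ceil(math.e / eps)  <span class="c1"># smaller eps -> wider table</span> |
|         self.depth = math.ceil(math.log(1.0 / (1.0 - confidence)))  <span class="c1"># more rows -> higher confidence</span> |
|         rng = random.Random(seed) |
|         self._salts = [rng.getrandbits(64) for _ in range(self.depth)] |
|         self._table = [[0] * self.width for _ in range(self.depth)] |
| |
|     def add(self, item) -> None: |
|         for row, salt in enumerate(self._salts): |
|             self._table[row][hash((salt, item)) % self.width] += 1 |
| |
|     def estimate(self, item) -> int: |
|         <span class="c1"># never undercounts; overcounts by at most eps * total adds, with the given confidence</span> |
|         return min(self._table[row][hash((salt, item)) % self.width] |
|                    for row, salt in enumerate(self._salts)) |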
| |
| <div class="viewcode-block" id="input_file_name"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.input_file_name.html#pyspark.sql.functions.input_file_name">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">input_file_name</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Creates a string column for the file name of the current Spark task.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> file names.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import os</span> |
| <span class="sd"> >>> path = os.path.abspath(__file__)</span> |
| <span class="sd"> >>> df = spark.read.text(path)</span> |
| <span class="sd"> >>> df.select(input_file_name()).first()</span> |
| <span class="sd"> Row(input_file_name()='file:///...')</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"input_file_name"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="isnan"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.isnan.html#pyspark.sql.functions.isnan">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">isnan</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""An expression that returns true if the column is NaN.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> True if value is NaN and False otherwise.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))</span> |
| <span class="sd"> >>> df.select("a", "b", isnan("a").alias("r1"), isnan(df.b).alias("r2")).show()</span> |
| <span class="sd"> +---+---+-----+-----+</span> |
| <span class="sd"> | a| b| r1| r2|</span> |
| <span class="sd"> +---+---+-----+-----+</span> |
| <span class="sd"> |1.0|NaN|false| true|</span> |
| <span class="sd"> |NaN|2.0| true|false|</span> |
| <span class="sd"> +---+---+-----+-----+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"isnan"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="isnull"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.isnull.html#pyspark.sql.functions.isnull">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">isnull</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""An expression that returns true if the column is null.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> True if value is null and False otherwise.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, None), (None, 2)], ("a", "b"))</span> |
| <span class="sd"> >>> df.select("a", "b", isnull("a").alias("r1"), isnull(df.b).alias("r2")).show()</span> |
| <span class="sd"> +----+----+-----+-----+</span> |
| <span class="sd"> | a| b| r1| r2|</span> |
| <span class="sd"> +----+----+-----+-----+</span> |
| <span class="sd"> | 1|NULL|false| true|</span> |
| <span class="sd"> |NULL| 2| true|false|</span> |
| <span class="sd"> +----+----+-----+-----+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"isnull"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="last"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.last.html#pyspark.sql.functions.last">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">last</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">ignorenulls</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Aggregate function: returns the last value in a group.</span> |
| |
| <span class="sd"> The function by default returns the last values it sees. It will return the last non-null</span> |
| <span class="sd"> value it sees when ignoreNulls is set to true. If all values are null, then null is returned.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The function is non-deterministic because its results depends on the order of the</span> |
| <span class="sd"> rows which may be non-deterministic after a shuffle.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column to fetch last value for.</span> |
| <span class="sd"> ignorenulls : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> if last value is null then look for non-null value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> last value of the group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5), ("Alice", None)], ("name", "age"))</span> |
| <span class="sd"> >>> df = df.orderBy(df.age.desc())</span> |
| <span class="sd"> >>> df.groupby("name").agg(last("age")).orderBy("name").show()</span> |
| <span class="sd"> +-----+---------+</span> |
| <span class="sd"> | name|last(age)|</span> |
| <span class="sd"> +-----+---------+</span> |
| <span class="sd"> |Alice| NULL|</span> |
| <span class="sd"> | Bob| 5|</span> |
| <span class="sd"> +-----+---------+</span> |
| |
| <span class="sd"> Now, to ignore any nulls we needs to set ``ignorenulls`` to `True`</span> |
| |
| <span class="sd"> >>> df.groupby("name").agg(last("age", ignorenulls=True)).orderBy("name").show()</span> |
| <span class="sd"> +-----+---------+</span> |
| <span class="sd"> | name|last(age)|</span> |
| <span class="sd"> +-----+---------+</span> |
| <span class="sd"> |Alice| 2|</span> |
| <span class="sd"> | Bob| 5|</span> |
| <span class="sd"> +-----+---------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"last"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">ignorenulls</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="monotonically_increasing_id"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.monotonically_increasing_id.html#pyspark.sql.functions.monotonically_increasing_id">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">monotonically_increasing_id</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""A column that generates monotonically increasing 64-bit integers.</span> |
| |
| <span class="sd"> The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.</span> |
| <span class="sd"> The current implementation puts the partition ID in the upper 31 bits, and the record number</span> |
| <span class="sd"> within each partition in the lower 33 bits. The assumption is that the data frame has</span> |
| <span class="sd"> less than 1 billion partitions, and each partition has less than 8 billion records.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The function is non-deterministic because its result depends on partition IDs.</span> |
| |
| <span class="sd"> As an example, consider a :class:`DataFrame` with two partitions, each with 3 records.</span> |
| <span class="sd"> This expression would return the following IDs:</span> |
| <span class="sd"> 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> last value of the group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> spark.range(0, 10, 1, 2).select(sf.monotonically_increasing_id()).show()</span> |
| <span class="sd"> +-----------------------------+</span> |
| <span class="sd"> |monotonically_increasing_id()|</span> |
| <span class="sd"> +-----------------------------+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> | 8589934592|</span> |
| <span class="sd"> | 8589934593|</span> |
| <span class="sd"> | 8589934594|</span> |
| <span class="sd"> | 8589934595|</span> |
| <span class="sd"> | 8589934596|</span> |
| <span class="sd"> +-----------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"monotonically_increasing_id"</span><span class="p">)</span></div> |
| |
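| <span class="c1"># Editor's note: a small illustrative helper (not part of this module, name made up)</span> |
| <span class="c1"># that reproduces the ID layout described in the Notes above: partition ID in the</span> |
| <span class="c1"># upper 31 bits, per-partition record number in the lower 33 bits.</span> |
| def _example_monotonic_id(partition_id: int, record_number: int) -> int: |
|     return (partition_id << 33) | record_number |
| |
| <span class="c1"># For two partitions of three records each this yields exactly the IDs quoted above.</span> |
| _expected = [0, 1, 2, 8589934592, 8589934593, 8589934594] |
| assert [_example_monotonic_id(p, r) for p in range(2) for r in range(3)] == _expected |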
| |
| <div class="viewcode-block" id="nanvl"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.nanvl.html#pyspark.sql.functions.nanvl">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">nanvl</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns col1 if it is not NaN, or col2 if col1 is NaN.</span> |
| |
| <span class="sd"> Both inputs should be floating point columns (:class:`DoubleType` or :class:`FloatType`).</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> first column to check.</span> |
| <span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> second column to return if first is NaN.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> value from first column or second if first is NaN .</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))</span> |
| <span class="sd"> >>> df.select(nanvl("a", "b").alias("r1"), nanvl(df.a, df.b).alias("r2")).collect()</span> |
| <span class="sd"> [Row(r1=1.0, r2=1.0), Row(r1=2.0, r2=2.0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"nanvl"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
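| <span class="c1"># Editor's note: an illustrative, roughly equivalent spelling of nanvl() using when()</span> |
| <span class="c1"># and isnan() from this module, shown only to clarify the semantics; the helper name</span> |
| <span class="c1"># is made up for this note.</span> |
| def _nanvl_spelled_out(col1: Column, col2: Column) -> Column: |
|     <span class="c1"># take col2 wherever col1 is NaN, otherwise keep col1</span> |
|     return when(isnan(col1), col2).otherwise(col1) |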
| |
| <div class="viewcode-block" id="percentile"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.percentile.html#pyspark.sql.functions.percentile">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">percentile</span><span class="p">(</span> |
| <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">percentage</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">float</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span> |
| <span class="n">frequency</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the exact percentile(s) of numeric column `expr` at the given percentage(s)</span> |
| <span class="sd"> with value range in [0.0, 1.0].</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str input column.</span> |
| <span class="sd"> percentage : :class:`~pyspark.sql.Column`, float, list of floats or tuple of floats</span> |
| <span class="sd"> percentage in decimal (must be between 0.0 and 1.0).</span> |
| <span class="sd"> frequency : :class:`~pyspark.sql.Column` or int is a positive numeric literal which</span> |
| <span class="sd"> controls frequency.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the exact `percentile` of the numeric column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> key = (col("id") % 3).alias("key")</span> |
| <span class="sd"> >>> value = (randn(42) + key * 10).alias("value")</span> |
| <span class="sd"> >>> df = spark.range(0, 1000, 1, 1).select(key, value)</span> |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... percentile("value", [0.25, 0.5, 0.75], lit(1)).alias("quantiles")</span> |
| <span class="sd"> ... ).show()</span> |
| <span class="sd"> +--------------------+</span> |
| <span class="sd"> | quantiles|</span> |
| <span class="sd"> +--------------------+</span> |
| <span class="sd"> |[0.74419914941216...|</span> |
| <span class="sd"> +--------------------+</span> |
| |
| <span class="sd"> >>> df.groupBy("key").agg(</span> |
| <span class="sd"> ... percentile("value", 0.5, lit(1)).alias("median")</span> |
| <span class="sd"> ... ).show()</span> |
| <span class="sd"> +---+--------------------+</span> |
| <span class="sd"> |key| median|</span> |
| <span class="sd"> +---+--------------------+</span> |
| <span class="sd"> | 0|-0.03449962216667901|</span> |
| <span class="sd"> | 1| 9.990389751837329|</span> |
| <span class="sd"> | 2| 19.967859769284075|</span> |
| <span class="sd"> +---+--------------------+</span> |
| <span class="sd"> """</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">percentage</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="c1"># A local list</span> |
| <span class="n">percentage</span> <span class="o">=</span> <span class="n">_invoke_function</span><span class="p">(</span> |
| <span class="s2">"array"</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="p">[</span><span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">percentage</span><span class="p">])</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">_jc</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">percentage</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="c1"># Already a Column</span> |
| <span class="n">percentage</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">percentage</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># Probably scalar</span> |
| <span class="n">percentage</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">percentage</span><span class="p">)</span> |
| |
| <span class="n">frequency</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">_to_java_column</span><span class="p">(</span><span class="n">frequency</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">frequency</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> |
| <span class="k">else</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">frequency</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"percentile"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">percentage</span><span class="p">,</span> <span class="n">frequency</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="percentile_approx"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.percentile_approx.html#pyspark.sql.functions.percentile_approx">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">percentile_approx</span><span class="p">(</span> |
| <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">percentage</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">float</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span> |
| <span class="n">accuracy</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="mi">10000</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the approximate `percentile` of the numeric column `col` which is the smallest value</span> |
| <span class="sd"> in the ordered `col` values (sorted from least to greatest) such that no more than `percentage`</span> |
| <span class="sd"> of `col` values is less than the value or equal to that value.</span> |
| |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> input column.</span> |
| <span class="sd"> percentage : :class:`~pyspark.sql.Column`, float, list of floats or tuple of floats</span> |
| <span class="sd"> percentage in decimal (must be between 0.0 and 1.0).</span> |
| <span class="sd"> When percentage is an array, each value of the percentage array must be between 0.0 and 1.0.</span> |
| <span class="sd"> In this case, returns the approximate percentile array of column col</span> |
| <span class="sd"> at the given percentage array.</span> |
| <span class="sd"> accuracy : :class:`~pyspark.sql.Column` or float</span> |
| <span class="sd"> is a positive numeric literal which controls approximation accuracy</span> |
| <span class="sd"> at the cost of memory. Higher value of accuracy yields better accuracy,</span> |
| <span class="sd"> 1.0/accuracy is the relative error of the approximation. (default: 10000).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> approximate `percentile` of the numeric column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> key = (col("id") % 3).alias("key")</span> |
| <span class="sd"> >>> value = (randn(42) + key * 10).alias("value")</span> |
| <span class="sd"> >>> df = spark.range(0, 1000, 1, 1).select(key, value)</span> |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... percentile_approx("value", [0.25, 0.5, 0.75], 1000000).alias("quantiles")</span> |
| <span class="sd"> ... ).printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- quantiles: array (nullable = true)</span> |
| <span class="sd"> | |-- element: double (containsNull = false)</span> |
| |
| <span class="sd"> >>> df.groupBy("key").agg(</span> |
| <span class="sd"> ... percentile_approx("value", 0.5, lit(1000000)).alias("median")</span> |
| <span class="sd"> ... ).printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- key: long (nullable = true)</span> |
| <span class="sd"> |-- median: double (nullable = true)</span> |
| <span class="sd"> """</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">percentage</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="c1"># A local list</span> |
| <span class="n">percentage</span> <span class="o">=</span> <span class="n">_invoke_function</span><span class="p">(</span> |
| <span class="s2">"array"</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="p">[</span><span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">percentage</span><span class="p">])</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">_jc</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">percentage</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="c1"># Already a Column</span> |
| <span class="n">percentage</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">percentage</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># Probably scalar</span> |
| <span class="n">percentage</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">percentage</span><span class="p">)</span> |
| |
| <span class="n">accuracy</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">_to_java_column</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">accuracy</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> |
| <span class="k">else</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"percentile_approx"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">percentage</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">)</span></div> |
| |
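| <span class="c1"># Editor's note: a small worked example (illustration only) of the ``accuracy`` parameter</span> |
| <span class="c1"># documented above. 1.0/accuracy is the relative rank error, so over N rows the returned</span> |
| <span class="c1"># value's rank may be off by roughly N / accuracy positions. The names are made up here.</span> |
| _accuracy = 10_000 |
| _n_rows = 1_000_000 |
| _max_rank_error = _n_rows / _accuracy  <span class="c1"># about 100 positions for these illustrative numbers</span> |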
| |
| <div class="viewcode-block" id="approx_percentile"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.approx_percentile.html#pyspark.sql.functions.approx_percentile">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">approx_percentile</span><span class="p">(</span> |
| <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">percentage</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">float</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span> |
| <span class="n">accuracy</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="mi">10000</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the approximate `percentile` of the numeric column `col` which is the smallest value</span> |
| <span class="sd"> in the ordered `col` values (sorted from least to greatest) such that no more than `percentage`</span> |
| <span class="sd"> of `col` values is less than the value or equal to that value.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> input column.</span> |
| <span class="sd"> percentage : :class:`~pyspark.sql.Column`, float, list of floats or tuple of floats</span> |
| <span class="sd"> percentage in decimal (must be between 0.0 and 1.0).</span> |
| <span class="sd"> When percentage is an array, each value of the percentage array must be between 0.0 and 1.0.</span> |
| <span class="sd"> In this case, returns the approximate percentile array of column col</span> |
| <span class="sd"> at the given percentage array.</span> |
| <span class="sd"> accuracy : :class:`~pyspark.sql.Column` or float</span> |
| <span class="sd"> is a positive numeric literal which controls approximation accuracy</span> |
| <span class="sd"> at the cost of memory. Higher value of accuracy yields better accuracy,</span> |
| <span class="sd"> 1.0/accuracy is the relative error of the approximation. (default: 10000).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> approximate `percentile` of the numeric column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> key = (sf.col("id") % 3).alias("key")</span> |
| <span class="sd"> >>> value = (sf.randn(42) + key * 10).alias("value")</span> |
| <span class="sd"> >>> df = spark.range(0, 1000, 1, 1).select(key, value)</span> |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... sf.approx_percentile("value", [0.25, 0.5, 0.75], 1000000)</span> |
| <span class="sd"> ... ).printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- approx_percentile(value, array(0.25, 0.5, 0.75), 1000000): array (nullable = true)</span> |
| <span class="sd"> | |-- element: double (containsNull = false)</span> |
| |
| <span class="sd"> >>> df.groupBy("key").agg(</span> |
| <span class="sd"> ... sf.approx_percentile("value", 0.5, sf.lit(1000000))</span> |
| <span class="sd"> ... ).printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- key: long (nullable = true)</span> |
| <span class="sd"> |-- approx_percentile(value, 0.5, 1000000): double (nullable = true)</span> |
| <span class="sd"> """</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">percentage</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="c1"># A local list</span> |
| <span class="n">percentage</span> <span class="o">=</span> <span class="n">_invoke_function</span><span class="p">(</span> |
| <span class="s2">"array"</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="p">[</span><span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">percentage</span><span class="p">])</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">_jc</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">percentage</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="c1"># Already a Column</span> |
| <span class="n">percentage</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">percentage</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># Probably scalar</span> |
| <span class="n">percentage</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">percentage</span><span class="p">)</span> |
| |
| <span class="n">accuracy</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">_to_java_column</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">accuracy</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> |
| <span class="k">else</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"approx_percentile"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">percentage</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="rand"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.rand.html#pyspark.sql.functions.rand">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">rand</span><span class="p">(</span><span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Generates a random column with independent and identically distributed (i.i.d.) samples</span> |
| <span class="sd"> uniformly distributed in [0.0, 1.0).</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The function is non-deterministic in general case.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> seed : int (default: None)</span> |
| <span class="sd"> seed value for random generator.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> random values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> spark.range(0, 2, 1, 1).withColumn('rand', sf.rand(seed=42) * 3).show()</span> |
| <span class="sd"> +---+------------------+</span> |
| <span class="sd"> | id| rand|</span> |
| <span class="sd"> +---+------------------+</span> |
| <span class="sd"> | 0|1.8575681106759028|</span> |
| <span class="sd"> | 1|1.5288056527339444|</span> |
| <span class="sd"> +---+------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">seed</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"rand"</span><span class="p">,</span> <span class="n">seed</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"rand"</span><span class="p">)</span></div> |
| |
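| <span class="c1"># Editor's note: an illustrative helper (name made up for this note) showing the usual</span> |
| <span class="c1"># way to rescale rand()'s [0.0, 1.0) output to an arbitrary half-open range [low, high),</span> |
| <span class="c1"># in the same spirit as the ``rand(seed=42) * 3`` example above.</span> |
| def _uniform_column(low: float, high: float, seed: int) -> Column: |
|     return lit(low) + (lit(high) - lit(low)) * rand(seed) |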
| |
| <div class="viewcode-block" id="randn"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.randn.html#pyspark.sql.functions.randn">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">randn</span><span class="p">(</span><span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Generates a column with independent and identically distributed (i.i.d.) samples from</span> |
| <span class="sd"> the standard normal distribution.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The function is non-deterministic in general case.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> seed : int (default: None)</span> |
| <span class="sd"> seed value for random generator.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> random values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> spark.range(0, 2, 1, 1).withColumn('randn', sf.randn(seed=42)).show()</span> |
| <span class="sd"> +---+------------------+</span> |
| <span class="sd"> | id| randn|</span> |
| <span class="sd"> +---+------------------+</span> |
| <span class="sd"> | 0| 2.384479054241165|</span> |
| <span class="sd"> | 1|0.1920934041293524|</span> |
| <span class="sd"> +---+------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">seed</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"randn"</span><span class="p">,</span> <span class="n">seed</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"randn"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="round"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.round.html#pyspark.sql.functions.round">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">round</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">scale</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Round the given value to `scale` decimal places using HALF_UP rounding mode if `scale` >= 0</span> |
| <span class="sd"> or at integral part when `scale` < 0.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> input column to round.</span> |
| <span class="sd"> scale : int optional default 0</span> |
| <span class="sd"> scale value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> rounded values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.createDataFrame([(2.5,)], ['a']).select(round('a', 0).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=3.0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"round"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">scale</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bround"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bround.html#pyspark.sql.functions.bround">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bround</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">scale</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Round the given value to `scale` decimal places using HALF_EVEN rounding mode if `scale` >= 0</span> |
| <span class="sd"> or at integral part when `scale` < 0.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> input column to round.</span> |
| <span class="sd"> scale : int optional default 0</span> |
| <span class="sd"> scale value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> rounded values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.createDataFrame([(2.5,)], ['a']).select(bround('a', 0).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=2.0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"bround"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">scale</span><span class="p">)</span></div> |
| |
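| <span class="c1"># Editor's note: purely illustrative -- the examples above show 2.5 rounding to 3.0 with</span> |
| <span class="c1"># round() (HALF_UP) but to 2.0 with bround() (HALF_EVEN). Python's standard decimal module</span> |
| <span class="c1"># exposes the same two rounding modes, which makes the difference easy to check locally.</span> |
| from decimal import Decimal, ROUND_HALF_UP, ROUND_HALF_EVEN |
| |
| assert Decimal("2.5").quantize(Decimal("1"), rounding=ROUND_HALF_UP) == Decimal("3") |
| assert Decimal("2.5").quantize(Decimal("1"), rounding=ROUND_HALF_EVEN) == Decimal("2") |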
| |
| <span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">shiftLeft</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">numBits</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Shift the given value numBits left.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> .. deprecated:: 3.2.0</span> |
| <span class="sd"> Use :func:`shiftleft` instead.</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">"Deprecated in 3.2, use shiftleft instead."</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">shiftleft</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="shiftleft"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.shiftleft.html#pyspark.sql.functions.shiftleft">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">shiftleft</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">numBits</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Shift the given value numBits left.</span> |
| |
| <span class="sd"> .. versionadded:: 3.2.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> input column of values to shift.</span> |
| <span class="sd"> numBits : int</span> |
| <span class="sd"> number of bits to shift.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> shifted value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.createDataFrame([(21,)], ['a']).select(shiftleft('a', 1).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=42)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"shiftleft"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">numBits</span><span class="p">)</span></div> |
| |
| |
| <span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">shiftRight</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">numBits</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""(Signed) shift the given value numBits right.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> .. deprecated:: 3.2.0</span> |
| <span class="sd"> Use :func:`shiftright` instead.</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">"Deprecated in 3.2, use shiftright instead."</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">shiftright</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="shiftright"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.shiftright.html#pyspark.sql.functions.shiftright">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">shiftright</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">numBits</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""(Signed) shift the given value numBits right.</span> |
| |
| <span class="sd"> .. versionadded:: 3.2.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> input column of values to shift.</span> |
| <span class="sd"> numBits : int</span> |
| <span class="sd"> number of bits to shift.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> shifted values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.createDataFrame([(42,)], ['a']).select(shiftright('a', 1).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=21)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"shiftright"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">numBits</span><span class="p">)</span></div> |
| |
| |
| <span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">shiftRightUnsigned</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">numBits</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Unsigned shift the given value numBits right.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> .. deprecated:: 3.2.0</span> |
| <span class="sd"> Use :func:`shiftrightunsigned` instead.</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">"Deprecated in 3.2, use shiftrightunsigned instead."</span><span class="p">,</span> <span class="ne">FutureWarning</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">shiftrightunsigned</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">numBits</span><span class="p">)</span> |
| |
| |
| <div class="viewcode-block" id="shiftrightunsigned"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.shiftrightunsigned.html#pyspark.sql.functions.shiftrightunsigned">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">shiftrightunsigned</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">numBits</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Unsigned shift the given value numBits right.</span> |
| |
| <span class="sd"> .. versionadded:: 3.2.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> input column of values to shift.</span> |
| <span class="sd"> numBits : int</span> |
| <span class="sd"> number of bits to shift.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> shifted value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(-42,)], ['a'])</span> |
| <span class="sd"> >>> df.select(shiftrightunsigned('a', 1).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=9223372036854775787)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"shiftrightunsigned"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">numBits</span><span class="p">)</span></div> |
| |
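| <span class="c1"># Editor's note: illustration only -- the example value above can be reproduced in plain</span> |
| <span class="c1"># Python by reinterpreting -42 as an unsigned 64-bit integer before shifting right.</span> |
| assert ((-42 & 0xFFFFFFFFFFFFFFFF) >> 1) == 9223372036854775787 |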
| |
| <div class="viewcode-block" id="spark_partition_id"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.spark_partition_id.html#pyspark.sql.functions.spark_partition_id">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">spark_partition_id</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""A column for partition ID.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This is non deterministic because it depends on data partitioning and task scheduling.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> partition id the record belongs to.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(2)</span> |
| <span class="sd"> >>> df.repartition(1).select(spark_partition_id().alias("pid")).collect()</span> |
| <span class="sd"> [Row(pid=0), Row(pid=0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"spark_partition_id"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="expr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.expr.html#pyspark.sql.functions.expr">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">expr</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Parses the expression string into the column that it represents</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : str</span> |
| <span class="sd"> expression defined in string.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> column representing the expression.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([["Alice"], ["Bob"]], ["name"])</span> |
| <span class="sd"> >>> df.select("name", expr("length(name)")).show()</span> |
| <span class="sd"> +-----+------------+</span> |
| <span class="sd"> | name|length(name)|</span> |
| <span class="sd"> +-----+------------+</span> |
| <span class="sd"> |Alice| 5|</span> |
| <span class="sd"> | Bob| 3|</span> |
| <span class="sd"> +-----+------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"expr"</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">struct</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">struct</span><span class="p">(</span><span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">,</span> <span class="o">...</span><span class="p">]])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <div class="viewcode-block" id="struct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.struct.html#pyspark.sql.functions.struct">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">struct</span><span class="p">(</span> |
| <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Creates a new struct column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : list, set, str or :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> column names or :class:`~pyspark.sql.Column`\\s to contain in the output struct.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a struct type column of given columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age"))</span> |
| <span class="sd"> >>> df.select(struct('age', 'name').alias("struct")).collect()</span> |
| <span class="sd"> [Row(struct=Row(age=2, name='Alice')), Row(struct=Row(age=5, name='Bob'))]</span> |
| <span class="sd"> >>> df.select(struct([df.age, df.name]).alias("struct")).collect()</span> |
| <span class="sd"> [Row(struct=Row(age=2, name='Alice')), Row(struct=Row(age=5, name='Bob'))]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span> |
| <span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"struct"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span> <span class="c1"># type: ignore[arg-type]</span></div> |
| |
| |
| <div class="viewcode-block" id="named_struct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.named_struct.html#pyspark.sql.functions.named_struct">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">named_struct</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Creates a struct with the given field names and values.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> list of columns to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, 2, 3)], ['a', 'b', 'c'])</span> |
| <span class="sd"> >>> df.select(named_struct(lit('x'), df.a, lit('y'), df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=Row(x=1, y=2))]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"named_struct"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="greatest"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.greatest.html#pyspark.sql.functions.greatest">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">greatest</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the greatest value of the list of column names, skipping null values.</span> |
| <span class="sd"> This function takes at least 2 parameters. It will return null if all parameters are null.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> columns to check for gratest value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> gratest value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, 4, 3)], ['a', 'b', 'c'])</span> |
| <span class="sd"> >>> df.select(greatest(df.a, df.b, df.c).alias("greatest")).collect()</span> |
| <span class="sd"> [Row(greatest=4)]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o"><</span> <span class="mi">2</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"WRONG_NUM_COLUMNS"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"func_name"</span><span class="p">:</span> <span class="s2">"greatest"</span><span class="p">,</span> <span class="s2">"num_cols"</span><span class="p">:</span> <span class="s2">"2"</span><span class="p">},</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"greatest"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="least"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.least.html#pyspark.sql.functions.least">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">least</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the least value of the list of column names, skipping null values.</span> |
| <span class="sd"> This function takes at least 2 parameters. It will return null if all parameters are null.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column names or columns to be compared</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> least value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, 4, 3)], ['a', 'b', 'c'])</span> |
| <span class="sd"> >>> df.select(least(df.a, df.b, df.c).alias("least")).collect()</span> |
| <span class="sd"> [Row(least=1)]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o"><</span> <span class="mi">2</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"WRONG_NUM_COLUMNS"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"func_name"</span><span class="p">:</span> <span class="s2">"least"</span><span class="p">,</span> <span class="s2">"num_cols"</span><span class="p">:</span> <span class="s2">"2"</span><span class="p">},</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"least"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="when"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.when.html#pyspark.sql.functions.when">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">when</span><span class="p">(</span><span class="n">condition</span><span class="p">:</span> <span class="n">Column</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Evaluates a list of conditions and returns one of multiple possible result expressions.</span> |
| <span class="sd"> If :func:`pyspark.sql.Column.otherwise` is not invoked, None is returned for unmatched</span> |
| <span class="sd"> conditions.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> condition : :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a boolean :class:`~pyspark.sql.Column` expression.</span> |
| <span class="sd"> value :</span> |
| <span class="sd"> a literal value, or a :class:`~pyspark.sql.Column` expression.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> column representing when expression.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(3)</span> |
| <span class="sd"> >>> df.select(when(df['id'] == 2, 3).otherwise(4).alias("age")).show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> |age|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> +---+</span> |
| |
| <span class="sd"> >>> df.select(when(df.id == 2, df.id + 1).alias("age")).show()</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> | age|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> |NULL|</span> |
| <span class="sd"> |NULL|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> """</span> |
| <span class="c1"># Explicitly not using ColumnOrName type here to make reading condition less opaque</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">condition</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_COLUMN"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"arg_name"</span><span class="p">:</span> <span class="s2">"condition"</span><span class="p">,</span> <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">condition</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| <span class="n">v</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">_jc</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">value</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"when"</span><span class="p">,</span> <span class="n">condition</span><span class="o">.</span><span class="n">_jc</span><span class="p">,</span> <span class="n">v</span><span class="p">)</span></div> |
| |
| |
| <span class="nd">@overload</span> <span class="c1"># type: ignore[no-redef]</span> |
| <span class="k">def</span> <span class="nf">log</span><span class="p">(</span><span class="n">arg1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">log</span><span class="p">(</span><span class="n">arg1</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> <span class="n">arg2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <div class="viewcode-block" id="log"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.log.html#pyspark.sql.functions.log">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">log</span><span class="p">(</span><span class="n">arg1</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">float</span><span class="p">],</span> <span class="n">arg2</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the first argument-based logarithm of the second argument.</span> |
| |
| <span class="sd"> If there is only one argument, then this takes the natural logarithm of the argument.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> arg1 : :class:`~pyspark.sql.Column`, str or float</span> |
| <span class="sd"> base number or actual number (in this case base is `e`)</span> |
| <span class="sd"> arg2 : :class:`~pyspark.sql.Column`, str or float</span> |
| <span class="sd"> number to calculate logariphm for.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> logariphm of given value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> df = spark.sql("SELECT * FROM VALUES (1), (2), (4) AS t(value)")</span> |
| <span class="sd"> >>> df.select(sf.log(2.0, df.value).alias('log2_value')).show()</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> |log2_value|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> | 0.0|</span> |
| <span class="sd"> | 1.0|</span> |
| <span class="sd"> | 2.0|</span> |
| <span class="sd"> +----------+</span> |
| |
| <span class="sd"> And Natural logarithm</span> |
| |
| <span class="sd"> >>> df.select(sf.log(df.value).alias('ln_value')).show()</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> | ln_value|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> | 0.0|</span> |
| <span class="sd"> |0.6931471805599453|</span> |
| <span class="sd"> |1.3862943611198906|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">arg2</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"log"</span><span class="p">,</span> <span class="n">cast</span><span class="p">(</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">arg1</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"log"</span><span class="p">,</span> <span class="n">arg1</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">arg2</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="ln"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ln.html#pyspark.sql.functions.ln">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">ln</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the natural logarithm of the argument.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a column to calculate logariphm for.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> natural logarithm of given value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(4,)], ['a'])</span> |
| <span class="sd"> >>> df.select(ln('a')).show()</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> | ln(a)|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> |1.3862943611198906|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"ln"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="log2"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.log2.html#pyspark.sql.functions.log2">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">log2</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the base-2 logarithm of the argument.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a column to calculate logariphm for.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> logariphm of given value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(4,)], ['a'])</span> |
| <span class="sd"> >>> df.select(log2('a').alias('log2')).show()</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> |log2|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> | 2.0|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"log2"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="conv"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.conv.html#pyspark.sql.functions.conv">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">conv</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">fromBase</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">toBase</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Convert a number in a string column from one base to another.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a column to convert base for.</span> |
| <span class="sd"> fromBase: int</span> |
| <span class="sd"> from base number.</span> |
| <span class="sd"> toBase: int</span> |
| <span class="sd"> to base number.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> logariphm of given value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("010101",)], ['n'])</span> |
| <span class="sd"> >>> df.select(conv(df.n, 2, 16).alias('hex')).collect()</span> |
| <span class="sd"> [Row(hex='15')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"conv"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">fromBase</span><span class="p">,</span> <span class="n">toBase</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="factorial"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.factorial.html#pyspark.sql.functions.factorial">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">factorial</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the factorial of the given value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a column to calculate factorial for.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> factorial of given value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(5,)], ['n'])</span> |
| <span class="sd"> >>> df.select(factorial(df.n).alias('f')).collect()</span> |
| <span class="sd"> [Row(f=120)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"factorial"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <span class="c1"># --------------- Window functions ------------------------</span> |
| |
| |
| <div class="viewcode-block" id="lag"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.lag.html#pyspark.sql.functions.lag">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">lag</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">offset</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">default</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Window function: returns the value that is `offset` rows before the current row, and</span> |
| <span class="sd"> `default` if there is less than `offset` rows before the current row. For example,</span> |
| <span class="sd"> an `offset` of one will return the previous row at any given point in the window partition.</span> |
| |
| <span class="sd"> This is equivalent to the LAG function in SQL.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> offset : int, optional default 1</span> |
| <span class="sd"> number of row to extend</span> |
| <span class="sd"> default : optional</span> |
| <span class="sd"> default value</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> value before current row based on `offset`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Window</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("a", 1),</span> |
| <span class="sd"> ... ("a", 2),</span> |
| <span class="sd"> ... ("a", 3),</span> |
| <span class="sd"> ... ("b", 8),</span> |
| <span class="sd"> ... ("b", 2)], ["c1", "c2"])</span> |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | c1| c2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | a| 2|</span> |
| <span class="sd"> | a| 3|</span> |
| <span class="sd"> | b| 8|</span> |
| <span class="sd"> | b| 2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> >>> w = Window.partitionBy("c1").orderBy("c2")</span> |
| <span class="sd"> >>> df.withColumn("previos_value", lag("c2").over(w)).show()</span> |
| <span class="sd"> +---+---+-------------+</span> |
| <span class="sd"> | c1| c2|previos_value|</span> |
| <span class="sd"> +---+---+-------------+</span> |
| <span class="sd"> | a| 1| NULL|</span> |
| <span class="sd"> | a| 2| 1|</span> |
| <span class="sd"> | a| 3| 2|</span> |
| <span class="sd"> | b| 2| NULL|</span> |
| <span class="sd"> | b| 8| 2|</span> |
| <span class="sd"> +---+---+-------------+</span> |
| <span class="sd"> >>> df.withColumn("previos_value", lag("c2", 1, 0).over(w)).show()</span> |
| <span class="sd"> +---+---+-------------+</span> |
| <span class="sd"> | c1| c2|previos_value|</span> |
| <span class="sd"> +---+---+-------------+</span> |
| <span class="sd"> | a| 1| 0|</span> |
| <span class="sd"> | a| 2| 1|</span> |
| <span class="sd"> | a| 3| 2|</span> |
| <span class="sd"> | b| 2| 0|</span> |
| <span class="sd"> | b| 8| 2|</span> |
| <span class="sd"> +---+---+-------------+</span> |
| <span class="sd"> >>> df.withColumn("previos_value", lag("c2", 2, -1).over(w)).show()</span> |
| <span class="sd"> +---+---+-------------+</span> |
| <span class="sd"> | c1| c2|previos_value|</span> |
| <span class="sd"> +---+---+-------------+</span> |
| <span class="sd"> | a| 1| -1|</span> |
| <span class="sd"> | a| 2| -1|</span> |
| <span class="sd"> | a| 3| 1|</span> |
| <span class="sd"> | b| 2| -1|</span> |
| <span class="sd"> | b| 8| -1|</span> |
| <span class="sd"> +---+---+-------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"lag"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">offset</span><span class="p">,</span> <span class="n">default</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="lead"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.lead.html#pyspark.sql.functions.lead">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">lead</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">offset</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">default</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Window function: returns the value that is `offset` rows after the current row, and</span> |
| <span class="sd"> `default` if there is less than `offset` rows after the current row. For example,</span> |
| <span class="sd"> an `offset` of one will return the next row at any given point in the window partition.</span> |
| |
| <span class="sd"> This is equivalent to the LEAD function in SQL.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> offset : int, optional default 1</span> |
| <span class="sd"> number of row to extend</span> |
| <span class="sd"> default : optional</span> |
| <span class="sd"> default value</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> value after current row based on `offset`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Window</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("a", 1),</span> |
| <span class="sd"> ... ("a", 2),</span> |
| <span class="sd"> ... ("a", 3),</span> |
| <span class="sd"> ... ("b", 8),</span> |
| <span class="sd"> ... ("b", 2)], ["c1", "c2"])</span> |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | c1| c2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | a| 2|</span> |
| <span class="sd"> | a| 3|</span> |
| <span class="sd"> | b| 8|</span> |
| <span class="sd"> | b| 2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> >>> w = Window.partitionBy("c1").orderBy("c2")</span> |
| <span class="sd"> >>> df.withColumn("next_value", lead("c2").over(w)).show()</span> |
| <span class="sd"> +---+---+----------+</span> |
| <span class="sd"> | c1| c2|next_value|</span> |
| <span class="sd"> +---+---+----------+</span> |
| <span class="sd"> | a| 1| 2|</span> |
| <span class="sd"> | a| 2| 3|</span> |
| <span class="sd"> | a| 3| NULL|</span> |
| <span class="sd"> | b| 2| 8|</span> |
| <span class="sd"> | b| 8| NULL|</span> |
| <span class="sd"> +---+---+----------+</span> |
| <span class="sd"> >>> df.withColumn("next_value", lead("c2", 1, 0).over(w)).show()</span> |
| <span class="sd"> +---+---+----------+</span> |
| <span class="sd"> | c1| c2|next_value|</span> |
| <span class="sd"> +---+---+----------+</span> |
| <span class="sd"> | a| 1| 2|</span> |
| <span class="sd"> | a| 2| 3|</span> |
| <span class="sd"> | a| 3| 0|</span> |
| <span class="sd"> | b| 2| 8|</span> |
| <span class="sd"> | b| 8| 0|</span> |
| <span class="sd"> +---+---+----------+</span> |
| <span class="sd"> >>> df.withColumn("next_value", lead("c2", 2, -1).over(w)).show()</span> |
| <span class="sd"> +---+---+----------+</span> |
| <span class="sd"> | c1| c2|next_value|</span> |
| <span class="sd"> +---+---+----------+</span> |
| <span class="sd"> | a| 1| 3|</span> |
| <span class="sd"> | a| 2| -1|</span> |
| <span class="sd"> | a| 3| -1|</span> |
| <span class="sd"> | b| 2| -1|</span> |
| <span class="sd"> | b| 8| -1|</span> |
| <span class="sd"> +---+---+----------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"lead"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">offset</span><span class="p">,</span> <span class="n">default</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="nth_value"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.nth_value.html#pyspark.sql.functions.nth_value">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">nth_value</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">offset</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Window function: returns the value that is the `offset`\\th row of the window frame</span> |
| <span class="sd"> (counting from 1), and `null` if the size of window frame is less than `offset` rows.</span> |
| |
| <span class="sd"> It will return the `offset`\\th non-null value it sees when `ignoreNulls` is set to</span> |
| <span class="sd"> true. If all values are null, then null is returned.</span> |
| |
| <span class="sd"> This is equivalent to the nth_value function in SQL.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> offset : int</span> |
| <span class="sd"> number of row to use as the value</span> |
| <span class="sd"> ignoreNulls : bool, optional</span> |
| <span class="sd"> indicates the Nth value should skip null in the</span> |
| <span class="sd"> determination of which row to use</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> value of nth row.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Window</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("a", 1),</span> |
| <span class="sd"> ... ("a", 2),</span> |
| <span class="sd"> ... ("a", 3),</span> |
| <span class="sd"> ... ("b", 8),</span> |
| <span class="sd"> ... ("b", 2)], ["c1", "c2"])</span> |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | c1| c2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | a| 2|</span> |
| <span class="sd"> | a| 3|</span> |
| <span class="sd"> | b| 8|</span> |
| <span class="sd"> | b| 2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> >>> w = Window.partitionBy("c1").orderBy("c2")</span> |
| <span class="sd"> >>> df.withColumn("nth_value", nth_value("c2", 1).over(w)).show()</span> |
| <span class="sd"> +---+---+---------+</span> |
| <span class="sd"> | c1| c2|nth_value|</span> |
| <span class="sd"> +---+---+---------+</span> |
| <span class="sd"> | a| 1| 1|</span> |
| <span class="sd"> | a| 2| 1|</span> |
| <span class="sd"> | a| 3| 1|</span> |
| <span class="sd"> | b| 2| 2|</span> |
| <span class="sd"> | b| 8| 2|</span> |
| <span class="sd"> +---+---+---------+</span> |
| <span class="sd"> >>> df.withColumn("nth_value", nth_value("c2", 2).over(w)).show()</span> |
| <span class="sd"> +---+---+---------+</span> |
| <span class="sd"> | c1| c2|nth_value|</span> |
| <span class="sd"> +---+---+---------+</span> |
| <span class="sd"> | a| 1| NULL|</span> |
| <span class="sd"> | a| 2| 2|</span> |
| <span class="sd"> | a| 3| 2|</span> |
| <span class="sd"> | b| 2| NULL|</span> |
| <span class="sd"> | b| 8| 8|</span> |
| <span class="sd"> +---+---+---------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"nth_value"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">offset</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="any_value"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.any_value.html#pyspark.sql.functions.any_value">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">any_value</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns some value of `col` for a group of rows.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> ignorenulls : :class:`~pyspark.sql.Column` or bool</span> |
| <span class="sd"> if first value is null then look for first non-null value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> some value of `col` for a group of rows.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(None, 1),</span> |
| <span class="sd"> ... ("a", 2),</span> |
| <span class="sd"> ... ("a", 3),</span> |
| <span class="sd"> ... ("b", 8),</span> |
| <span class="sd"> ... ("b", 2)], ["c1", "c2"])</span> |
| <span class="sd"> >>> df.select(any_value('c1'), any_value('c2')).collect()</span> |
| <span class="sd"> [Row(any_value(c1)=None, any_value(c2)=1)]</span> |
| <span class="sd"> >>> df.select(any_value('c1', True), any_value('c2', True)).collect()</span> |
| <span class="sd"> [Row(any_value(c1)='a', any_value(c2)=1)]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">ignoreNulls</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"any_value"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">ignoreNulls</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">ignoreNulls</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ignoreNulls</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)</span> <span class="k">else</span> <span class="n">ignoreNulls</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"any_value"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="first_value"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.first_value.html#pyspark.sql.functions.first_value">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">first_value</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the first value of `col` for a group of rows. It will return the first non-null</span> |
| <span class="sd"> value it sees when `ignoreNulls` is set to true. If all values are null, then null is returned.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> ignorenulls : :class:`~pyspark.sql.Column` or bool</span> |
| <span class="sd"> if first value is null then look for first non-null value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> some value of `col` for a group of rows.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [(None, 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["a", "b"]</span> |
| <span class="sd"> ... ).select(sf.first_value('a'), sf.first_value('b')).show()</span> |
| <span class="sd"> +--------------+--------------+</span> |
| <span class="sd"> |first_value(a)|first_value(b)|</span> |
| <span class="sd"> +--------------+--------------+</span> |
| <span class="sd"> | NULL| 1|</span> |
| <span class="sd"> +--------------+--------------+</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [(None, 1), ("a", 2), ("a", 3), ("b", 8), ("b", 2)], ["a", "b"]</span> |
| <span class="sd"> ... ).select(sf.first_value('a', True), sf.first_value('b', True)).show()</span> |
| <span class="sd"> +--------------+--------------+</span> |
| <span class="sd"> |first_value(a)|first_value(b)|</span> |
| <span class="sd"> +--------------+--------------+</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> +--------------+--------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">ignoreNulls</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"first_value"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">ignoreNulls</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">ignoreNulls</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ignoreNulls</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)</span> <span class="k">else</span> <span class="n">ignoreNulls</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"first_value"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="last_value"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.last_value.html#pyspark.sql.functions.last_value">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">last_value</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the last value of `col` for a group of rows. It will return the last non-null</span> |
| <span class="sd"> value it sees when `ignoreNulls` is set to true. If all values are null, then null is returned.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> ignorenulls : :class:`~pyspark.sql.Column` or bool</span> |
| <span class="sd"> if first value is null then look for first non-null value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> some value of `col` for a group of rows.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [("a", 1), ("a", 2), ("a", 3), ("b", 8), (None, 2)], ["a", "b"]</span> |
| <span class="sd"> ... ).select(sf.last_value('a'), sf.last_value('b')).show()</span> |
| <span class="sd"> +-------------+-------------+</span> |
| <span class="sd"> |last_value(a)|last_value(b)|</span> |
| <span class="sd"> +-------------+-------------+</span> |
| <span class="sd"> | NULL| 2|</span> |
| <span class="sd"> +-------------+-------------+</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [("a", 1), ("a", 2), ("a", 3), ("b", 8), (None, 2)], ["a", "b"]</span> |
| <span class="sd"> ... ).select(sf.last_value('a', True), sf.last_value('b', True)).show()</span> |
| <span class="sd"> +-------------+-------------+</span> |
| <span class="sd"> |last_value(a)|last_value(b)|</span> |
| <span class="sd"> +-------------+-------------+</span> |
| <span class="sd"> | b| 2|</span> |
| <span class="sd"> +-------------+-------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">ignoreNulls</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"last_value"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">ignoreNulls</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">ignoreNulls</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ignoreNulls</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)</span> <span class="k">else</span> <span class="n">ignoreNulls</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"last_value"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">ignoreNulls</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="count_if"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.count_if.html#pyspark.sql.functions.count_if">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">count_if</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the number of `TRUE` values for the `col`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the number of `TRUE` values for the `col`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("a", 1),</span> |
| <span class="sd"> ... ("a", 2),</span> |
| <span class="sd"> ... ("a", 3),</span> |
| <span class="sd"> ... ("b", 8),</span> |
| <span class="sd"> ... ("b", 2)], ["c1", "c2"])</span> |
| <span class="sd"> >>> df.select(count_if(col('c2') % 2 == 0)).show()</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> |count_if(((c2 % 2) = 0))|</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> +------------------------+</span> |
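| |
| <span class="sd"> An illustrative sketch: `count_if` can also be used as a grouped aggregate; the</span> |
| <span class="sd"> output below is indicative only.</span> |
| <span class="sd"> >>> df.groupBy("c1").agg(count_if(col('c2') % 2 == 0)).sort("c1").show()  # doctest: +SKIP</span> |
| <span class="sd"> +---+------------------------+</span> |
| <span class="sd"> | c1|count_if(((c2 % 2) = 0))|</span> |
| <span class="sd"> +---+------------------------+</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | b| 2|</span> |
| <span class="sd"> +---+------------------------+</span> |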
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"count_if"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="histogram_numeric"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.histogram_numeric.html#pyspark.sql.functions.histogram_numeric">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">histogram_numeric</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">nBins</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Computes a histogram on numeric 'col' using nb bins.</span> |
| <span class="sd"> The return value is an array of (x,y) pairs representing the centers of the</span> |
| <span class="sd"> histogram's bins. As the value of 'nb' is increased, the histogram approximation</span> |
| <span class="sd"> gets finer-grained, but may yield artifacts around outliers. In practice, 20-40</span> |
| <span class="sd"> histogram bins appear to work well, with more bins being required for skewed or</span> |
| <span class="sd"> smaller datasets. Note that this function creates a histogram with non-uniform</span> |
| <span class="sd"> bin widths. It offers no guarantees in terms of the mean-squared-error of the</span> |
| <span class="sd"> histogram, but in practice is comparable to the histograms produced by the R/S-Plus</span> |
| <span class="sd"> statistical computing packages. Note: the output type of the 'x' field in the return value is</span> |
| <span class="sd"> propagated from the input value consumed in the aggregate function.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> nBins : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> number of histogram bins.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a histogram on numeric 'col' using `nBins` bins.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("a", 1),</span> |
| <span class="sd"> ... ("a", 2),</span> |
| <span class="sd"> ... ("a", 3),</span> |
| <span class="sd"> ... ("b", 8),</span> |
| <span class="sd"> ... ("b", 2)], ["c1", "c2"])</span> |
| <span class="sd"> >>> df.select(histogram_numeric('c2', lit(5))).show()</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> |histogram_numeric(c2, 5)|</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> | [{1, 1.0}, {2, 1....|</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"histogram_numeric"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">nBins</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="ntile"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ntile.html#pyspark.sql.functions.ntile">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">ntile</span><span class="p">(</span><span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Window function: returns the ntile group id (from 1 to `n` inclusive)</span> |
| <span class="sd"> in an ordered window partition. For example, if `n` is 4, the first</span> |
| <span class="sd"> quarter of the rows will get value 1, the second quarter will get 2,</span> |
| <span class="sd"> the third quarter will get 3, and the last quarter will get 4.</span> |
| |
| <span class="sd"> This is equivalent to the NTILE function in SQL.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int</span> |
| <span class="sd"> an integer</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the ntile group id.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Window</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("a", 1),</span> |
| <span class="sd"> ... ("a", 2),</span> |
| <span class="sd"> ... ("a", 3),</span> |
| <span class="sd"> ... ("b", 8),</span> |
| <span class="sd"> ... ("b", 2)], ["c1", "c2"])</span> |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | c1| c2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | a| 2|</span> |
| <span class="sd"> | a| 3|</span> |
| <span class="sd"> | b| 8|</span> |
| <span class="sd"> | b| 2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> >>> w = Window.partitionBy("c1").orderBy("c2")</span> |
| <span class="sd"> >>> df.withColumn("ntile", ntile(2).over(w)).show()</span> |
| <span class="sd"> +---+---+-----+</span> |
| <span class="sd"> | c1| c2|ntile|</span> |
| <span class="sd"> +---+---+-----+</span> |
| <span class="sd"> | a| 1| 1|</span> |
| <span class="sd"> | a| 2| 1|</span> |
| <span class="sd"> | a| 3| 2|</span> |
| <span class="sd"> | b| 2| 1|</span> |
| <span class="sd"> | b| 8| 2|</span> |
| <span class="sd"> +---+---+-----+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"ntile"</span><span class="p">,</span> <span class="nb">int</span><span class="p">(</span><span class="n">n</span><span class="p">))</span></div> |
| |
| |
| <span class="c1"># ---------------------- Date/Timestamp functions ------------------------------</span> |
| |
| |
| <div class="viewcode-block" id="curdate"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.curdate.html#pyspark.sql.functions.curdate">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">curdate</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the current date at the start of query evaluation as a :class:`DateType` column.</span> |
| <span class="sd"> All calls of current_date within the same query return the same value.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> current date.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(1).select(sf.curdate()).show() # doctest: +SKIP</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> |current_date()|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> | 2022-08-26|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"curdate"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="current_date"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.current_date.html#pyspark.sql.functions.current_date">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">current_date</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the current date at the start of query evaluation as a :class:`DateType` column.</span> |
| <span class="sd"> All calls of current_date within the same query return the same value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> current date.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(current_date()).show() # doctest: +SKIP</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> |current_date()|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> | 2022-08-26|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"current_date"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="current_timezone"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.current_timezone.html#pyspark.sql.functions.current_timezone">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">current_timezone</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the current session local timezone.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> current session local timezone.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")</span> |
| <span class="sd"> >>> spark.range(1).select(current_timezone()).show()</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> | current_timezone()|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> |America/Los_Angeles|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"current_timezone"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="current_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.current_timestamp.html#pyspark.sql.functions.current_timestamp">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">current_timestamp</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the current timestamp at the start of query evaluation as a :class:`TimestampType`</span> |
| <span class="sd"> column. All calls of current_timestamp within the same query return the same value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> current date and time.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(current_timestamp()).show(truncate=False) # doctest: +SKIP</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> |current_timestamp() |</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> |2022-08-26 21:23:22.716|</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"current_timestamp"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="now"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.now.html#pyspark.sql.functions.now">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">now</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the current timestamp at the start of query evaluation.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> current timestamp at the start of query evaluation.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(now()).show(truncate=False) # doctest: +SKIP</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> |now() |</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> |2022-08-26 21:23:22.716|</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"current_timestamp"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="localtimestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.localtimestamp.html#pyspark.sql.functions.localtimestamp">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">localtimestamp</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the current timestamp without time zone at the start of query evaluation</span> |
| <span class="sd"> as a timestamp without time zone column. All calls of localtimestamp within the</span> |
| <span class="sd"> same query return the same value.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> current local date and time.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(localtimestamp()).show(truncate=False) # doctest: +SKIP</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> |localtimestamp() |</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> |2022-08-26 21:28:34.639|</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"localtimestamp"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="date_format"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.date_format.html#pyspark.sql.functions.date_format">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">date_format</span><span class="p">(</span><span class="n">date</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Converts a date/timestamp/string to a value of string in the format specified by the date</span> |
| <span class="sd"> format given by the second argument.</span> |
| |
| <span class="sd"> A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All</span> |
| <span class="sd"> pattern letters of `datetime pattern`_ can be used.</span> |
| |
| <span class="sd"> .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Whenever possible, use specialized functions like `year`.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> date : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> input column of values to format.</span> |
| <span class="sd"> format : str</span> |
| <span class="sd"> format to use to represent datetime values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> string value representing formatted datetime.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])</span> |
| <span class="sd"> >>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect()</span> |
| <span class="sd"> [Row(date='04/08/2015')]</span> |
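| |
| <span class="sd"> An illustrative sketch (locale-dependent, output indicative only): a</span> |
| <span class="sd"> day-of-week text pattern can also be used.</span> |
| <span class="sd"> >>> df.select(date_format('dt', 'EEEE').alias('weekday')).collect()  # doctest: +SKIP</span> |
| <span class="sd"> [Row(weekday='Wednesday')]</span> |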
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"date_format"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">date</span><span class="p">),</span> <span class="nb">format</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="year"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.year.html#pyspark.sql.functions.year">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">year</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extract the year of a given date/timestamp as integer.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date/timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> year part of the date/timestamp as integer.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])</span> |
| <span class="sd"> >>> df.select(year('dt').alias('year')).collect()</span> |
| <span class="sd"> [Row(year=2015)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"year"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="quarter"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.quarter.html#pyspark.sql.functions.quarter">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">quarter</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extract the quarter of a given date/timestamp as integer.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date/timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> quarter of the date/timestamp as integer.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])</span> |
| <span class="sd"> >>> df.select(quarter('dt').alias('quarter')).collect()</span> |
| <span class="sd"> [Row(quarter=2)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"quarter"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="month"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.month.html#pyspark.sql.functions.month">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">month</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extract the month of a given date/timestamp as integer.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date/timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> month part of the date/timestamp as integer.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])</span> |
| <span class="sd"> >>> df.select(month('dt').alias('month')).collect()</span> |
| <span class="sd"> [Row(month=4)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"month"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="dayofweek"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.dayofweek.html#pyspark.sql.functions.dayofweek">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">dayofweek</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extract the day of the week of a given date/timestamp as integer.</span> |
| <span class="sd"> Ranges from 1 for a Sunday through to 7 for a Saturday.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date/timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> day of the week for given date/timestamp as integer.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])</span> |
| <span class="sd"> >>> df.select(dayofweek('dt').alias('day')).collect()</span> |
| <span class="sd"> [Row(day=4)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"dayofweek"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="dayofmonth"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.dayofmonth.html#pyspark.sql.functions.dayofmonth">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">dayofmonth</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extract the day of the month of a given date/timestamp as integer.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date/timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> day of the month for given date/timestamp as integer.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])</span> |
| <span class="sd"> >>> df.select(dayofmonth('dt').alias('day')).collect()</span> |
| <span class="sd"> [Row(day=8)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"dayofmonth"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="day"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.day.html#pyspark.sql.functions.day">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">day</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extract the day of the month of a given date/timestamp as integer.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date/timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> day of the month for given date/timestamp as integer.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])</span> |
| <span class="sd"> >>> df.select(day('dt').alias('day')).collect()</span> |
| <span class="sd"> [Row(day=8)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"day"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="dayofyear"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.dayofyear.html#pyspark.sql.functions.dayofyear">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">dayofyear</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extract the day of the year of a given date/timestamp as integer.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date/timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> day of the year for given date/timestamp as integer.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])</span> |
| <span class="sd"> >>> df.select(dayofyear('dt').alias('day')).collect()</span> |
| <span class="sd"> [Row(day=98)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"dayofyear"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="hour"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hour.html#pyspark.sql.functions.hour">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">hour</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extract the hours of a given timestamp as integer.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date/timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> hour part of the timestamp as integer.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import datetime</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts'])</span> |
| <span class="sd"> >>> df.select(hour('ts').alias('hour')).collect()</span> |
| <span class="sd"> [Row(hour=13)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"hour"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="minute"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.minute.html#pyspark.sql.functions.minute">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">minute</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extract the minutes of a given timestamp as integer.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date/timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> minutes part of the timestamp as integer.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import datetime</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts'])</span> |
| <span class="sd"> >>> df.select(minute('ts').alias('minute')).collect()</span> |
| <span class="sd"> [Row(minute=8)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"minute"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="second"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.second.html#pyspark.sql.functions.second">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">second</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extract the seconds of a given date as integer.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date/timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> `seconds` part of the timestamp as integer.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import datetime</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts'])</span> |
| <span class="sd"> >>> df.select(second('ts').alias('second')).collect()</span> |
| <span class="sd"> [Row(second=15)]</span> |
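| |
| <span class="sd"> An illustrative sketch: the related `hour` and `minute` functions can be combined</span> |
| <span class="sd"> in a single select (output indicative only).</span> |
| <span class="sd"> >>> df.select(hour('ts'), minute('ts'), second('ts')).show()  # doctest: +SKIP</span> |
| <span class="sd"> +--------+----------+----------+</span> |
| <span class="sd"> |hour(ts)|minute(ts)|second(ts)|</span> |
| <span class="sd"> +--------+----------+----------+</span> |
| <span class="sd"> | 13| 8| 15|</span> |
| <span class="sd"> +--------+----------+----------+</span> |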
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"second"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="weekofyear"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.weekofyear.html#pyspark.sql.functions.weekofyear">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">weekofyear</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extract the week number of a given date as integer.</span> |
| <span class="sd"> A week is considered to start on a Monday and week 1 is the first week with more than 3 days,</span> |
| <span class="sd"> as defined by ISO 8601.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> `week` of the year for given date as integer.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])</span> |
| <span class="sd"> >>> df.select(weekofyear(df.dt).alias('week')).collect()</span> |
| <span class="sd"> [Row(week=15)]</span> |
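| |
| <span class="sd"> An illustrative sketch of the ISO 8601 rule (output indicative only): a date in</span> |
| <span class="sd"> early January can still belong to the last week of the previous year.</span> |
| <span class="sd"> >>> spark.createDataFrame([('2016-01-01',)], ['dt']).select(</span> |
| <span class="sd"> ...     weekofyear('dt').alias('week')).collect()  # doctest: +SKIP</span> |
| <span class="sd"> [Row(week=53)]</span> |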
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"weekofyear"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="weekday"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.weekday.html#pyspark.sql.functions.weekday">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">weekday</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the day of the week for date/timestamp (0 = Monday, 1 = Tuesday, ..., 6 = Sunday).</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date/timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the day of the week for date/timestamp (0 = Monday, 1 = Tuesday, ..., 6 = Sunday).</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])</span> |
| <span class="sd"> >>> df.select(weekday('dt').alias('day')).show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> |day|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> +---+</span> |
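| |
| <span class="sd"> An illustrative sketch (output indicative only): `weekday` (0 = Monday) can be</span> |
| <span class="sd"> contrasted with `dayofweek` (1 = Sunday).</span> |
| <span class="sd"> >>> df.select(weekday('dt').alias('wd'), dayofweek('dt').alias('dow')).show()  # doctest: +SKIP</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | wd|dow|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 2| 4|</span> |
| <span class="sd"> +---+---+</span> |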
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"weekday"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="extract"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.extract.html#pyspark.sql.functions.extract">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">extract</span><span class="p">(</span><span class="n">field</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">source</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extracts a part of the date/timestamp or interval source.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> field : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> selects which part of the source should be extracted.</span> |
| <span class="sd"> source : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a date/timestamp or interval column from where `field` should be extracted.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a part of the date/timestamp or interval source.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import datetime</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts'])</span> |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... extract(lit('YEAR'), 'ts').alias('year'),</span> |
| <span class="sd"> ... extract(lit('month'), 'ts').alias('month'),</span> |
| <span class="sd"> ... extract(lit('WEEK'), 'ts').alias('week'),</span> |
| <span class="sd"> ... extract(lit('D'), 'ts').alias('day'),</span> |
| <span class="sd"> ... extract(lit('M'), 'ts').alias('minute'),</span> |
| <span class="sd"> ... extract(lit('S'), 'ts').alias('second')</span> |
| <span class="sd"> ... ).collect()</span> |
| <span class="sd"> [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal('15.000000'))]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"extract"</span><span class="p">,</span> <span class="n">field</span><span class="p">,</span> <span class="n">source</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="date_part"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.date_part.html#pyspark.sql.functions.date_part">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">date_part</span><span class="p">(</span><span class="n">field</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">source</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extracts a part of the date/timestamp or interval source.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> field : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> selects which part of the source should be extracted, and supported string values</span> |
| <span class="sd"> are the same as the fields of the equivalent function `extract`.</span> |
| <span class="sd"> source : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a date/timestamp or interval column from where `field` should be extracted.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a part of the date/timestamp or interval source.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import datetime</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts'])</span> |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... date_part(lit('YEAR'), 'ts').alias('year'),</span> |
| <span class="sd"> ... date_part(lit('month'), 'ts').alias('month'),</span> |
| <span class="sd"> ... date_part(lit('WEEK'), 'ts').alias('week'),</span> |
| <span class="sd"> ... date_part(lit('D'), 'ts').alias('day'),</span> |
| <span class="sd"> ... date_part(lit('M'), 'ts').alias('minute'),</span> |
| <span class="sd"> ... date_part(lit('S'), 'ts').alias('second')</span> |
| <span class="sd"> ... ).collect()</span> |
| <span class="sd"> [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal('15.000000'))]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"date_part"</span><span class="p">,</span> <span class="n">field</span><span class="p">,</span> <span class="n">source</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="datepart"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.datepart.html#pyspark.sql.functions.datepart">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">datepart</span><span class="p">(</span><span class="n">field</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">source</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extracts a part of the date/timestamp or interval source.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> field : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> selects which part of the source should be extracted, and supported string values</span> |
| <span class="sd"> are the same as the fields of the equivalent function `extract`.</span> |
| <span class="sd"> source : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a date/timestamp or interval column from where `field` should be extracted.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a part of the date/timestamp or interval source.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import datetime</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(datetime.datetime(2015, 4, 8, 13, 8, 15),)], ['ts'])</span> |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... datepart(lit('YEAR'), 'ts').alias('year'),</span> |
| <span class="sd"> ... datepart(lit('month'), 'ts').alias('month'),</span> |
| <span class="sd"> ... datepart(lit('WEEK'), 'ts').alias('week'),</span> |
| <span class="sd"> ... datepart(lit('D'), 'ts').alias('day'),</span> |
| <span class="sd"> ... datepart(lit('M'), 'ts').alias('minute'),</span> |
| <span class="sd"> ... datepart(lit('S'), 'ts').alias('second')</span> |
| <span class="sd"> ... ).collect()</span> |
| <span class="sd"> [Row(year=2015, month=4, week=15, day=8, minute=8, second=Decimal('15.000000'))]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"datepart"</span><span class="p">,</span> <span class="n">field</span><span class="p">,</span> <span class="n">source</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="make_date"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.make_date.html#pyspark.sql.functions.make_date">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">make_date</span><span class="p">(</span><span class="n">year</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">month</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">day</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a column with a date built from the year, month and day columns.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> year : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The year to build the date</span> |
| <span class="sd"> month : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The month to build the date</span> |
| <span class="sd"> day : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The day to build the date</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a date built from given parts.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2020, 6, 26)], ['Y', 'M', 'D'])</span> |
| <span class="sd"> >>> df.select(make_date(df.Y, df.M, df.D).alias("datefield")).collect()</span> |
| <span class="sd"> [Row(datefield=datetime.date(2020, 6, 26))]</span> |
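| |
| <span class="sd"> An illustrative sketch: literal values can be used as well (output indicative</span> |
| <span class="sd"> only).</span> |
| <span class="sd"> >>> spark.range(1).select(make_date(lit(2020), lit(6), lit(26)).alias('d')).collect()  # doctest: +SKIP</span> |
| <span class="sd"> [Row(d=datetime.date(2020, 6, 26))]</span> |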
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"make_date"</span><span class="p">,</span> <span class="n">year</span><span class="p">,</span> <span class="n">month</span><span class="p">,</span> <span class="n">day</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="date_add"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.date_add.html#pyspark.sql.functions.date_add">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">date_add</span><span class="p">(</span><span class="n">start</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">days</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">int</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the date that is `days` days after `start`. If `days` is a negative value</span> |
| <span class="sd"> then that number of days will be deducted from `start`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> start : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> date column to work on.</span> |
| <span class="sd"> days : :class:`~pyspark.sql.Column` or str or int</span> |
| <span class="sd"> how many days after the given date to calculate.</span> |
| <span class="sd"> Accepts negative value as well to calculate backwards in time.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a date after/before given number of days.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08', 2,)], ['dt', 'add'])</span> |
| <span class="sd"> >>> df.select(date_add(df.dt, 1).alias('next_date')).collect()</span> |
| <span class="sd"> [Row(next_date=datetime.date(2015, 4, 9))]</span> |
| <span class="sd"> >>> df.select(date_add(df.dt, df.add.cast('integer')).alias('next_date')).collect()</span> |
| <span class="sd"> [Row(next_date=datetime.date(2015, 4, 10))]</span> |
| <span class="sd"> >>> df.select(date_add('dt', -1).alias('prev_date')).collect()</span> |
| <span class="sd"> [Row(prev_date=datetime.date(2015, 4, 7))]</span> |
| <span class="sd"> """</span> |
| <span class="n">days</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">days</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">days</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">days</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"date_add"</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">days</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="dateadd"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.dateadd.html#pyspark.sql.functions.dateadd">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">dateadd</span><span class="p">(</span><span class="n">start</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">days</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">int</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the date that is `days` days after `start`. If `days` is a negative value</span> |
| <span class="sd"> then these amount of days will be deducted from `start`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> start : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> date column to work on.</span> |
| <span class="sd"> days : :class:`~pyspark.sql.Column` or str or int</span> |
| <span class="sd"> how many days after the given date to calculate.</span> |
| <span class="sd"> Accepts negative value as well to calculate backwards in time.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a date after/before given number of days.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [('2015-04-08', 2,)], ['dt', 'add']</span> |
| <span class="sd"> ... ).select(sf.dateadd("dt", 1)).show()</span> |
| <span class="sd"> +---------------+</span> |
| <span class="sd"> |date_add(dt, 1)|</span> |
| <span class="sd"> +---------------+</span> |
| <span class="sd"> | 2015-04-09|</span> |
| <span class="sd"> +---------------+</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [('2015-04-08', 2,)], ['dt', 'add']</span> |
| <span class="sd"> ... ).select(sf.dateadd("dt", sf.lit(2))).show()</span> |
| <span class="sd"> +---------------+</span> |
| <span class="sd"> |date_add(dt, 2)|</span> |
| <span class="sd"> +---------------+</span> |
| <span class="sd"> | 2015-04-10|</span> |
| <span class="sd"> +---------------+</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [('2015-04-08', 2,)], ['dt', 'add']</span> |
| <span class="sd"> ... ).select(sf.dateadd("dt", -1)).show()</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> |date_add(dt, -1)|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> | 2015-04-07|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> """</span> |
| <span class="n">days</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">days</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">days</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">days</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"dateadd"</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">days</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="date_sub"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.date_sub.html#pyspark.sql.functions.date_sub">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">date_sub</span><span class="p">(</span><span class="n">start</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">days</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">int</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the date that is `days` days before `start`. If `days` is a negative value</span> |
| <span class="sd"> then these amount of days will be added to `start`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> start : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> date column to work on.</span> |
| <span class="sd"> days : :class:`~pyspark.sql.Column` or str or int</span> |
| <span class="sd"> how many days before the given date to calculate.</span> |
| <span class="sd"> Accepts negative value as well to calculate forward in time.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a date before/after given number of days.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08', 2,)], ['dt', 'sub'])</span> |
| <span class="sd"> >>> df.select(date_sub(df.dt, 1).alias('prev_date')).collect()</span> |
| <span class="sd"> [Row(prev_date=datetime.date(2015, 4, 7))]</span> |
| <span class="sd"> >>> df.select(date_sub(df.dt, df.sub.cast('integer')).alias('prev_date')).collect()</span> |
| <span class="sd"> [Row(prev_date=datetime.date(2015, 4, 6))]</span> |
| <span class="sd"> >>> df.select(date_sub('dt', -1).alias('next_date')).collect()</span> |
| <span class="sd"> [Row(next_date=datetime.date(2015, 4, 9))]</span> |
| <span class="sd"> """</span> |
| <span class="n">days</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">days</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">days</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">days</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"date_sub"</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">days</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="datediff"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.datediff.html#pyspark.sql.functions.datediff">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">datediff</span><span class="p">(</span><span class="n">end</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the number of days from `start` to `end`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> end : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> to date column to work on.</span> |
| <span class="sd"> start : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> from date column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> difference in days between two dates.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08','2015-05-10')], ['d1', 'd2'])</span> |
| <span class="sd"> >>> df.select(datediff(df.d2, df.d1).alias('diff')).collect()</span> |
| <span class="sd"> [Row(diff=32)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"datediff"</span><span class="p">,</span> <span class="n">end</span><span class="p">,</span> <span class="n">start</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="date_diff"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.date_diff.html#pyspark.sql.functions.date_diff">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">date_diff</span><span class="p">(</span><span class="n">end</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the number of days from `start` to `end`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> end : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> to date column to work on.</span> |
| <span class="sd"> start : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> from date column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> difference in days between two dates.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08','2015-05-10')], ['d1', 'd2'])</span> |
| <span class="sd"> >>> df.select(date_diff(df.d2, df.d1).alias('diff')).collect()</span> |
| <span class="sd"> [Row(diff=32)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"date_diff"</span><span class="p">,</span> <span class="n">end</span><span class="p">,</span> <span class="n">start</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="date_from_unix_date"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.date_from_unix_date.html#pyspark.sql.functions.date_from_unix_date">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">date_from_unix_date</span><span class="p">(</span><span class="n">days</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Create date from the number of `days` since 1970-01-01.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> days : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the date from the number of days since 1970-01-01.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(date_from_unix_date(lit(1))).show()</span> |
| <span class="sd"> +----------------------+</span> |
| <span class="sd"> |date_from_unix_date(1)|</span> |
| <span class="sd"> +----------------------+</span> |
| <span class="sd"> | 1970-01-02|</span> |
| <span class="sd"> +----------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"date_from_unix_date"</span><span class="p">,</span> <span class="n">days</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="add_months"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.add_months.html#pyspark.sql.functions.add_months">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">add_months</span><span class="p">(</span><span class="n">start</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">months</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">int</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the date that is `months` months after `start`. If `months` is a negative value</span> |
| <span class="sd"> then these amount of months will be deducted from the `start`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> start : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> date column to work on.</span> |
| <span class="sd"> months : :class:`~pyspark.sql.Column` or str or int</span> |
| <span class="sd"> how many months after the given date to calculate.</span> |
| <span class="sd"> Accepts negative value as well to calculate backwards.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a date after/before given number of months.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08', 2)], ['dt', 'add'])</span> |
| <span class="sd"> >>> df.select(add_months(df.dt, 1).alias('next_month')).collect()</span> |
| <span class="sd"> [Row(next_month=datetime.date(2015, 5, 8))]</span> |
| <span class="sd"> >>> df.select(add_months(df.dt, df.add.cast('integer')).alias('next_month')).collect()</span> |
| <span class="sd"> [Row(next_month=datetime.date(2015, 6, 8))]</span> |
| <span class="sd"> >>> df.select(add_months('dt', -2).alias('prev_month')).collect()</span> |
| <span class="sd"> [Row(prev_month=datetime.date(2015, 2, 8))]</span> |
| <span class="sd"> """</span> |
| <span class="n">months</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">months</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">months</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">months</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"add_months"</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">months</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="months_between"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.months_between.html#pyspark.sql.functions.months_between">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">months_between</span><span class="p">(</span><span class="n">date1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">date2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">roundOff</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns number of months between dates date1 and date2.</span> |
| <span class="sd"> If date1 is later than date2, then the result is positive.</span> |
| <span class="sd"> A whole number is returned if both inputs have the same day of month or both are the last day</span> |
| <span class="sd"> of their respective months. Otherwise, the difference is calculated assuming 31 days per month.</span> |
| <span class="sd"> The result is rounded off to 8 digits unless `roundOff` is set to `False`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> date1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> first date column.</span> |
| <span class="sd"> date2 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> second date column.</span> |
| <span class="sd"> roundOff : bool, optional</span> |
| <span class="sd"> whether to round (to 8 digits) the final value or not (default: True).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> number of months between two dates.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')], ['date1', 'date2'])</span> |
| <span class="sd"> >>> df.select(months_between(df.date1, df.date2).alias('months')).collect()</span> |
| <span class="sd"> [Row(months=3.94959677)]</span> |
| <span class="sd"> >>> df.select(months_between(df.date1, df.date2, False).alias('months')).collect()</span> |
| <span class="sd"> [Row(months=3.9495967741935485)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span> |
| <span class="s2">"months_between"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">date1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">date2</span><span class="p">),</span> <span class="n">roundOff</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="to_date"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_date.html#pyspark.sql.functions.to_date">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">to_date</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.DateType`</span> |
| <span class="sd"> using the optionally specified format. Specify formats according to `datetime pattern`_.</span> |
| <span class="sd"> By default, it follows casting rules to :class:`pyspark.sql.types.DateType` if the format</span> |
| <span class="sd"> is omitted. Equivalent to ``col.cast("date")``.</span> |
| |
| <span class="sd"> .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> input column of values to convert.</span> |
| <span class="sd"> format: str, optional</span> |
| <span class="sd"> format to use to convert date values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> date value as :class:`pyspark.sql.types.DateType` type.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])</span> |
| <span class="sd"> >>> df.select(to_date(df.t).alias('date')).collect()</span> |
| <span class="sd"> [Row(date=datetime.date(1997, 2, 28))]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])</span> |
| <span class="sd"> >>> df.select(to_date(df.t, 'yyyy-MM-dd HH:mm:ss').alias('date')).collect()</span> |
| <span class="sd"> [Row(date=datetime.date(1997, 2, 28))]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"to_date"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"to_date"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="nb">format</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="unix_date"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unix_date.html#pyspark.sql.functions.unix_date">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">unix_date</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the number of days since 1970-01-01.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('1970-01-02',)], ['t'])</span> |
| <span class="sd"> >>> df.select(unix_date(to_date(df.t)).alias('n')).collect()</span> |
| <span class="sd"> [Row(n=1)]</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"unix_date"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="unix_micros"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unix_micros.html#pyspark.sql.functions.unix_micros">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">unix_micros</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the number of microseconds since 1970-01-01 00:00:00 UTC.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-07-22 10:00:00',)], ['t'])</span> |
| <span class="sd"> >>> df.select(unix_micros(to_timestamp(df.t)).alias('n')).collect()</span> |
| <span class="sd"> [Row(n=1437584400000000)]</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"unix_micros"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="unix_millis"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unix_millis.html#pyspark.sql.functions.unix_millis">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">unix_millis</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the number of milliseconds since 1970-01-01 00:00:00 UTC.</span> |
| <span class="sd"> Truncates higher levels of precision.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-07-22 10:00:00',)], ['t'])</span> |
| <span class="sd"> >>> df.select(unix_millis(to_timestamp(df.t)).alias('n')).collect()</span> |
| <span class="sd"> [Row(n=1437584400000)]</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"unix_millis"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="unix_seconds"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unix_seconds.html#pyspark.sql.functions.unix_seconds">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">unix_seconds</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the number of seconds since 1970-01-01 00:00:00 UTC.</span> |
| <span class="sd"> Truncates higher levels of precision.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-07-22 10:00:00',)], ['t'])</span> |
| <span class="sd"> >>> df.select(unix_seconds(to_timestamp(df.t)).alias('n')).collect()</span> |
| <span class="sd"> [Row(n=1437584400)]</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"unix_seconds"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">to_timestamp</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">to_timestamp</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <div class="viewcode-block" id="to_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_timestamp.html#pyspark.sql.functions.to_timestamp">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">to_timestamp</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Converts a :class:`~pyspark.sql.Column` into :class:`pyspark.sql.types.TimestampType`</span> |
| <span class="sd"> using the optionally specified format. Specify formats according to `datetime pattern`_.</span> |
| <span class="sd"> By default, it follows casting rules to :class:`pyspark.sql.types.TimestampType` if the format</span> |
| <span class="sd"> is omitted. Equivalent to ``col.cast("timestamp")``.</span> |
| |
| <span class="sd"> .. _datetime pattern: https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column values to convert.</span> |
| <span class="sd"> format: str, optional</span> |
| <span class="sd"> format to use to convert timestamp values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> timestamp value as :class:`pyspark.sql.types.TimestampType` type.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])</span> |
| <span class="sd"> >>> df.select(to_timestamp(df.t).alias('dt')).collect()</span> |
| <span class="sd"> [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])</span> |
| <span class="sd"> >>> df.select(to_timestamp(df.t, 'yyyy-MM-dd HH:mm:ss').alias('dt')).collect()</span> |
| <span class="sd"> [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"to_timestamp"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"to_timestamp"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="nb">format</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="try_to_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_to_timestamp.html#pyspark.sql.functions.try_to_timestamp">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">try_to_timestamp</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Parses the `col` with the `format` to a timestamp. The function always</span> |
| <span class="sd"> returns null on an invalid input with/without ANSI SQL mode enabled. The result data type is</span> |
| <span class="sd"> consistent with the value of configuration `spark.sql.timestampType`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column values to convert.</span> |
| <span class="sd"> format: str, optional</span> |
| <span class="sd"> format to use to convert timestamp values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('1997-02-28 10:30:00',)], ['t'])</span> |
| <span class="sd"> >>> df.select(try_to_timestamp(df.t).alias('dt')).collect()</span> |
| <span class="sd"> [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]</span> |
| |
| <span class="sd"> >>> df.select(try_to_timestamp(df.t, lit('yyyy-MM-dd HH:mm:ss')).alias('dt')).collect()</span> |
| <span class="sd"> [Row(dt=datetime.datetime(1997, 2, 28, 10, 30))]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"try_to_timestamp"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"try_to_timestamp"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="xpath"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath.html#pyspark.sql.functions.xpath">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">xpath</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a string array of values within the nodes of xml that match the XPath expression.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [('<a><b>b1</b><b>b2</b><b>b3</b><c>c1</c><c>c2</c></a>',)], ['x'])</span> |
| <span class="sd"> >>> df.select(xpath(df.x, lit('a/b/text()')).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=['b1', 'b2', 'b3'])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"xpath"</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="xpath_boolean"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_boolean.html#pyspark.sql.functions.xpath_boolean">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">xpath_boolean</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns true if the XPath expression evaluates to true, or if a matching node is found.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('<a><b>1</b></a>',)], ['x'])</span> |
| <span class="sd"> >>> df.select(xpath_boolean(df.x, lit('a/b')).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=True)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"xpath_boolean"</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="xpath_double"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_double.html#pyspark.sql.functions.xpath_double">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">xpath_double</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a double value, the value zero if no match is found,</span> |
| <span class="sd"> or NaN if a match is found but the value is non-numeric.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])</span> |
| <span class="sd"> >>> df.select(xpath_double(df.x, lit('sum(a/b)')).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=3.0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"xpath_double"</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="xpath_number"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_number.html#pyspark.sql.functions.xpath_number">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">xpath_number</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a double value, the value zero if no match is found,</span> |
| <span class="sd"> or NaN if a match is found but the value is non-numeric.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [('<a><b>1</b><b>2</b></a>',)], ['x']</span> |
| <span class="sd"> ... ).select(sf.xpath_number('x', sf.lit('sum(a/b)'))).show()</span> |
| <span class="sd"> +-------------------------+</span> |
| <span class="sd"> |xpath_number(x, sum(a/b))|</span> |
| <span class="sd"> +-------------------------+</span> |
| <span class="sd"> | 3.0|</span> |
| <span class="sd"> +-------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"xpath_number"</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="xpath_float"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_float.html#pyspark.sql.functions.xpath_float">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">xpath_float</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a float value, the value zero if no match is found,</span> |
| <span class="sd"> or NaN if a match is found but the value is non-numeric.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])</span> |
| <span class="sd"> >>> df.select(xpath_float(df.x, lit('sum(a/b)')).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=3.0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"xpath_float"</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="xpath_int"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_int.html#pyspark.sql.functions.xpath_int">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">xpath_int</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns an integer value, or the value zero if no match is found,</span> |
| <span class="sd"> or a match is found but the value is non-numeric.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])</span> |
| <span class="sd"> >>> df.select(xpath_int(df.x, lit('sum(a/b)')).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=3)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"xpath_int"</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="xpath_long"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_long.html#pyspark.sql.functions.xpath_long">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">xpath_long</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a long integer value, or the value zero if no match is found,</span> |
| <span class="sd"> or a match is found but the value is non-numeric.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])</span> |
| <span class="sd"> >>> df.select(xpath_long(df.x, lit('sum(a/b)')).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=3)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"xpath_long"</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="xpath_short"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_short.html#pyspark.sql.functions.xpath_short">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">xpath_short</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a short integer value, or the value zero if no match is found,</span> |
| <span class="sd"> or a match is found but the value is non-numeric.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('<a><b>1</b><b>2</b></a>',)], ['x'])</span> |
| <span class="sd"> >>> df.select(xpath_short(df.x, lit('sum(a/b)')).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=3)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"xpath_short"</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="xpath_string"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xpath_string.html#pyspark.sql.functions.xpath_string">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">xpath_string</span><span class="p">(</span><span class="n">xml</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the text contents of the first xml node that matches the XPath expression.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('<a><b>b</b><c>cc</c></a>',)], ['x'])</span> |
| <span class="sd"> >>> df.select(xpath_string(df.x, lit('a/c')).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='cc')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"xpath_string"</span><span class="p">,</span> <span class="n">xml</span><span class="p">,</span> <span class="n">path</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="trunc"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.trunc.html#pyspark.sql.functions.trunc">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">trunc</span><span class="p">(</span><span class="n">date</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns date truncated to the unit specified by the format.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> date : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> input column of values to truncate.</span> |
| <span class="sd"> format : str</span> |
| <span class="sd"> 'year', 'yyyy', 'yy' to truncate by year,</span> |
| <span class="sd"> or 'month', 'mon', 'mm' to truncate by month</span> |
| <span class="sd"> Other options are: 'week', 'quarter'</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> truncated date.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('1997-02-28',)], ['d'])</span> |
| <span class="sd"> >>> df.select(trunc(df.d, 'year').alias('year')).collect()</span> |
| <span class="sd"> [Row(year=datetime.date(1997, 1, 1))]</span> |
| <span class="sd"> >>> df.select(trunc(df.d, 'mon').alias('month')).collect()</span> |
| <span class="sd"> [Row(month=datetime.date(1997, 2, 1))]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"trunc"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">date</span><span class="p">),</span> <span class="nb">format</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="date_trunc"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.date_trunc.html#pyspark.sql.functions.date_trunc">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">date_trunc</span><span class="p">(</span><span class="nb">format</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns timestamp truncated to the unit specified by the format.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> format : str</span> |
| <span class="sd"> 'year', 'yyyy', 'yy' to truncate by year,</span> |
| <span class="sd"> 'month', 'mon', 'mm' to truncate by month,</span> |
| <span class="sd"> 'day', 'dd' to truncate by day,</span> |
| <span class="sd"> Other options are:</span> |
| <span class="sd"> 'microsecond', 'millisecond', 'second', 'minute', 'hour', 'week', 'quarter'</span> |
| <span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> input column of values to truncate.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> truncated timestamp.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('1997-02-28 05:02:11',)], ['t'])</span> |
| <span class="sd"> >>> df.select(date_trunc('year', df.t).alias('year')).collect()</span> |
| <span class="sd"> [Row(year=datetime.datetime(1997, 1, 1, 0, 0))]</span> |
| <span class="sd"> >>> df.select(date_trunc('mon', df.t).alias('month')).collect()</span> |
| <span class="sd"> [Row(month=datetime.datetime(1997, 2, 1, 0, 0))]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"date_trunc"</span><span class="p">,</span> <span class="nb">format</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="next_day"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.next_day.html#pyspark.sql.functions.next_day">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">next_day</span><span class="p">(</span><span class="n">date</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">dayOfWeek</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the first date which is later than the value of the date column</span> |
| <span class="sd"> based on second `week day` argument.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> date : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| <span class="sd"> dayOfWeek : str</span> |
| <span class="sd"> day of the week, case-insensitive, accepts:</span> |
| <span class="sd"> "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column of computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-07-27',)], ['d'])</span> |
| <span class="sd"> >>> df.select(next_day(df.d, 'Sun').alias('date')).collect()</span> |
| <span class="sd"> [Row(date=datetime.date(2015, 8, 2))]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"next_day"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">date</span><span class="p">),</span> <span class="n">dayOfWeek</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="last_day"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.last_day.html#pyspark.sql.functions.last_day">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">last_day</span><span class="p">(</span><span class="n">date</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the last day of the month which the given date belongs to.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> date : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> last day of the month.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('1997-02-10',)], ['d'])</span> |
| <span class="sd"> >>> df.select(last_day(df.d).alias('date')).collect()</span> |
| <span class="sd"> [Row(date=datetime.date(1997, 2, 28))]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"last_day"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">date</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="from_unixtime"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.from_unixtime.html#pyspark.sql.functions.from_unixtime">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">from_unixtime</span><span class="p">(</span><span class="n">timestamp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"yyyy-MM-dd HH:mm:ss"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string</span> |
| <span class="sd"> representing the timestamp of that moment in the current system time zone in the given</span> |
| <span class="sd"> format.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column of unix time values.</span> |
| <span class="sd"> format : str, optional</span> |
| <span class="sd"> format to use to convert to (default: yyyy-MM-dd HH:mm:ss)</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> formatted timestamp as string.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")</span> |
| <span class="sd"> >>> time_df = spark.createDataFrame([(1428476400,)], ['unix_time'])</span> |
| <span class="sd"> >>> time_df.select(from_unixtime('unix_time').alias('ts')).collect()</span> |
| <span class="sd"> [Row(ts='2015-04-08 00:00:00')]</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"from_unixtime"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">),</span> <span class="nb">format</span><span class="p">)</span></div> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">unix_timestamp</span><span class="p">(</span><span class="n">timestamp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="o">...</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">unix_timestamp</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <div class="viewcode-block" id="unix_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unix_timestamp.html#pyspark.sql.functions.unix_timestamp">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">unix_timestamp</span><span class="p">(</span> |
| <span class="n">timestamp</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"yyyy-MM-dd HH:mm:ss"</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Convert time string with given pattern ('yyyy-MM-dd HH:mm:ss', by default)</span> |
| <span class="sd"> to Unix time stamp (in seconds), using the default timezone and the default</span> |
| <span class="sd"> locale, returns null if failed.</span> |
| |
| <span class="sd"> if `timestamp` is None, then it returns current timestamp.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> timestamps of string values.</span> |
| <span class="sd"> format : str, optional</span> |
| <span class="sd"> alternative format to use for converting (default: yyyy-MM-dd HH:mm:ss).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> unix time as long integer.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")</span> |
| <span class="sd"> >>> time_df = spark.createDataFrame([('2015-04-08',)], ['dt'])</span> |
| <span class="sd"> >>> time_df.select(unix_timestamp('dt', 'yyyy-MM-dd').alias('unix_time')).collect()</span> |
| <span class="sd"> [Row(unix_time=1428476400)]</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">timestamp</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"unix_timestamp"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"unix_timestamp"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">),</span> <span class="nb">format</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="from_utc_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.from_utc_timestamp.html#pyspark.sql.functions.from_utc_timestamp">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">from_utc_timestamp</span><span class="p">(</span><span class="n">timestamp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">tz</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function</span> |
| <span class="sd"> takes a timestamp which is timezone-agnostic, and interprets it as a timestamp in UTC, and</span> |
| <span class="sd"> renders that timestamp as a timestamp in the given time zone.</span> |
| |
| <span class="sd"> However, timestamp in Spark represents number of microseconds from the Unix epoch, which is not</span> |
| <span class="sd"> timezone-agnostic. So in Spark this function just shift the timestamp value from UTC timezone to</span> |
| <span class="sd"> the given timezone.</span> |
| |
| <span class="sd"> This function may return confusing result if the input is a string with timezone, e.g.</span> |
| <span class="sd"> '2018-03-13T06:18:23+00:00'. The reason is that, Spark firstly cast the string to timestamp</span> |
| <span class="sd"> according to the timezone in the string, and finally display the result by converting the</span> |
| <span class="sd"> timestamp to string according to the session local timezone.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the column that contains timestamps</span> |
| <span class="sd"> tz : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A string detailing the time zone ID that the input should be adjusted to. It should</span> |
| <span class="sd"> be in the format of either region-based zone IDs or zone offsets. Region IDs must</span> |
| <span class="sd"> have the form 'area/city', such as 'America/Los_Angeles'. Zone offsets must be in</span> |
| <span class="sd"> the format '(+|-)HH:mm', for example '-08:00' or '+01:00'. Also 'UTC' and 'Z' are</span> |
| <span class="sd"> supported as aliases of '+00:00'. Other short names are not recommended to use</span> |
| <span class="sd"> because they can be ambiguous.</span> |
| |
| <span class="sd"> .. versionchanged:: 2.4</span> |
| <span class="sd"> `tz` can take a :class:`~pyspark.sql.Column` containing timezone ID strings.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> timestamp value represented in given timezone.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('1997-02-28 10:30:00', 'JST')], ['ts', 'tz'])</span> |
| <span class="sd"> >>> df.select(from_utc_timestamp(df.ts, "PST").alias('local_time')).collect()</span> |
| <span class="sd"> [Row(local_time=datetime.datetime(1997, 2, 28, 2, 30))]</span> |
| <span class="sd"> >>> df.select(from_utc_timestamp(df.ts, df.tz).alias('local_time')).collect()</span> |
| <span class="sd"> [Row(local_time=datetime.datetime(1997, 2, 28, 19, 30))]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">tz</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="n">tz</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">tz</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"from_utc_timestamp"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">),</span> <span class="n">tz</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="to_utc_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_utc_timestamp.html#pyspark.sql.functions.to_utc_timestamp">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">to_utc_timestamp</span><span class="p">(</span><span class="n">timestamp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">tz</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> This is a common function for databases supporting TIMESTAMP WITHOUT TIMEZONE. This function</span> |
| <span class="sd"> takes a timestamp which is timezone-agnostic, and interprets it as a timestamp in the given</span> |
| <span class="sd"> timezone, and renders that timestamp as a timestamp in UTC.</span> |
| |
| <span class="sd"> However, timestamp in Spark represents number of microseconds from the Unix epoch, which is not</span> |
| <span class="sd"> timezone-agnostic. So in Spark this function just shift the timestamp value from the given</span> |
| <span class="sd"> timezone to UTC timezone.</span> |
| |
| <span class="sd"> This function may return confusing result if the input is a string with timezone, e.g.</span> |
| <span class="sd"> '2018-03-13T06:18:23+00:00'. The reason is that, Spark firstly cast the string to timestamp</span> |
| <span class="sd"> according to the timezone in the string, and finally display the result by converting the</span> |
| <span class="sd"> timestamp to string according to the session local timezone.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the column that contains timestamps</span> |
| <span class="sd"> tz : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A string detailing the time zone ID that the input should be adjusted to. It should</span> |
| <span class="sd"> be in the format of either region-based zone IDs or zone offsets. Region IDs must</span> |
| <span class="sd"> have the form 'area/city', such as 'America/Los_Angeles'. Zone offsets must be in</span> |
| <span class="sd"> the format '(+|-)HH:mm', for example '-08:00' or '+01:00'. Also 'UTC' and 'Z' are</span> |
| <span class="sd"> supported as aliases of '+00:00'. Other short names are not recommended to use</span> |
| <span class="sd"> because they can be ambiguous.</span> |
| |
| <span class="sd"> .. versionchanged:: 2.4.0</span> |
| <span class="sd"> `tz` can take a :class:`~pyspark.sql.Column` containing timezone ID strings.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> timestamp value represented in UTC timezone.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('1997-02-28 10:30:00', 'JST')], ['ts', 'tz'])</span> |
| <span class="sd"> >>> df.select(to_utc_timestamp(df.ts, "PST").alias('utc_time')).collect()</span> |
| <span class="sd"> [Row(utc_time=datetime.datetime(1997, 2, 28, 18, 30))]</span> |
| <span class="sd"> >>> df.select(to_utc_timestamp(df.ts, df.tz).alias('utc_time')).collect()</span> |
| <span class="sd"> [Row(utc_time=datetime.datetime(1997, 2, 28, 1, 30))]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">tz</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="n">tz</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">tz</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"to_utc_timestamp"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timestamp</span><span class="p">),</span> <span class="n">tz</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="timestamp_seconds"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.timestamp_seconds.html#pyspark.sql.functions.timestamp_seconds">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">timestamp_seconds</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Converts the number of seconds from the Unix epoch (1970-01-01T00:00:00Z)</span> |
| <span class="sd"> to a timestamp.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> unix time values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> converted timestamp value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import timestamp_seconds</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "UTC")</span> |
| <span class="sd"> >>> time_df = spark.createDataFrame([(1230219000,)], ['unix_time'])</span> |
| <span class="sd"> >>> time_df.select(timestamp_seconds(time_df.unix_time).alias('ts')).show()</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> | ts|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> |2008-12-25 15:30:00|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> >>> time_df.select(timestamp_seconds('unix_time').alias('ts')).printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- ts: timestamp (nullable = true)</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"timestamp_seconds"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="timestamp_millis"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.timestamp_millis.html#pyspark.sql.functions.timestamp_millis">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">timestamp_millis</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Creates timestamp from the number of milliseconds since UTC epoch.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> unix time values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> converted timestamp value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "UTC")</span> |
| <span class="sd"> >>> time_df = spark.createDataFrame([(1230219000,)], ['unix_time'])</span> |
| <span class="sd"> >>> time_df.select(timestamp_millis(time_df.unix_time).alias('ts')).show()</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> | ts|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> |1970-01-15 05:43:39|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> >>> time_df.select(timestamp_millis('unix_time').alias('ts')).printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- ts: timestamp (nullable = true)</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"timestamp_millis"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="timestamp_micros"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.timestamp_micros.html#pyspark.sql.functions.timestamp_micros">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">timestamp_micros</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Creates timestamp from the number of microseconds since UTC epoch.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> unix time values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> converted timestamp value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "UTC")</span> |
| <span class="sd"> >>> time_df = spark.createDataFrame([(1230219000,)], ['unix_time'])</span> |
| <span class="sd"> >>> time_df.select(timestamp_micros(time_df.unix_time).alias('ts')).show()</span> |
| <span class="sd"> +--------------------+</span> |
| <span class="sd"> | ts|</span> |
| <span class="sd"> +--------------------+</span> |
| <span class="sd"> |1970-01-01 00:20:...|</span> |
| <span class="sd"> +--------------------+</span> |
| <span class="sd"> >>> time_df.select(timestamp_micros('unix_time').alias('ts')).printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- ts: timestamp (nullable = true)</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"timestamp_micros"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="window"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.window.html#pyspark.sql.functions.window">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">window</span><span class="p">(</span> |
| <span class="n">timeColumn</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">windowDuration</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">slideDuration</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">startTime</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Bucketize rows into one or more time windows given a timestamp specifying column. Window</span> |
| <span class="sd"> starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window</span> |
| <span class="sd"> [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in</span> |
| <span class="sd"> the order of months are not supported.</span> |
| |
| <span class="sd"> The time column must be of :class:`pyspark.sql.types.TimestampType`.</span> |
| |
| <span class="sd"> Durations are provided as strings, e.g. '1 second', '1 day 12 hours', '2 minutes'. Valid</span> |
| <span class="sd"> interval strings are 'week', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'.</span> |
| <span class="sd"> If the ``slideDuration`` is not provided, the windows will be tumbling windows.</span> |
| |
| <span class="sd"> The startTime is the offset with respect to 1970-01-01 00:00:00 UTC with which to start</span> |
| <span class="sd"> window intervals. For example, in order to have hourly tumbling windows that start 15 minutes</span> |
| <span class="sd"> past the hour, e.g. 12:15-13:15, 13:15-14:15... provide `startTime` as `15 minutes`.</span> |
| |
| <span class="sd"> The output column will be a struct called 'window' by default with the nested columns 'start'</span> |
| <span class="sd"> and 'end', where 'start' and 'end' will be of :class:`pyspark.sql.types.TimestampType`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> timeColumn : :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> The column or the expression to use as the timestamp for windowing by time.</span> |
| <span class="sd"> The time column must be of TimestampType or TimestampNTZType.</span> |
| <span class="sd"> windowDuration : str</span> |
| <span class="sd"> A string specifying the width of the window, e.g. `10 minutes`,</span> |
| <span class="sd"> `1 second`. Check `org.apache.spark.unsafe.types.CalendarInterval` for</span> |
| <span class="sd"> valid duration identifiers. Note that the duration is a fixed length of</span> |
| <span class="sd"> time, and does not vary over time according to a calendar. For example,</span> |
| <span class="sd"> `1 day` always means 86,400,000 milliseconds, not a calendar day.</span> |
| <span class="sd"> slideDuration : str, optional</span> |
| <span class="sd"> A new window will be generated every `slideDuration`. Must be less than</span> |
| <span class="sd"> or equal to the `windowDuration`. Check</span> |
| <span class="sd"> `org.apache.spark.unsafe.types.CalendarInterval` for valid duration</span> |
| <span class="sd"> identifiers. This duration is likewise absolute, and does not vary</span> |
| <span class="sd"> according to a calendar.</span> |
| <span class="sd"> startTime : str, optional</span> |
| <span class="sd"> The offset with respect to 1970-01-01 00:00:00 UTC with which to start</span> |
| <span class="sd"> window intervals. For example, in order to have hourly tumbling windows that</span> |
| <span class="sd"> start 15 minutes past the hour, e.g. 12:15-13:15, 13:15-14:15... provide</span> |
| <span class="sd"> `startTime` as `15 minutes`.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import datetime</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)],</span> |
| <span class="sd"> ... ).toDF("date", "val")</span> |
| <span class="sd"> >>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum"))</span> |
| <span class="sd"> >>> w.select(w.window.start.cast("string").alias("start"),</span> |
| <span class="sd"> ... w.window.end.cast("string").alias("end"), "sum").collect()</span> |
| <span class="sd"> [Row(start='2016-03-11 09:00:05', end='2016-03-11 09:00:10', sum=1)]</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="nf">check_string_field</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="n">fieldName</span><span class="p">):</span> <span class="c1"># type: ignore[no-untyped-def]</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">field</span> <span class="ow">or</span> <span class="nb">type</span><span class="p">(</span><span class="n">field</span><span class="p">)</span> <span class="ow">is</span> <span class="ow">not</span> <span class="nb">str</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_STR"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"arg_name"</span><span class="p">:</span> <span class="n">fieldName</span><span class="p">,</span> <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">field</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="n">time_col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timeColumn</span><span class="p">)</span> |
| <span class="n">check_string_field</span><span class="p">(</span><span class="n">windowDuration</span><span class="p">,</span> <span class="s2">"windowDuration"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">slideDuration</span> <span class="ow">and</span> <span class="n">startTime</span><span class="p">:</span> |
| <span class="n">check_string_field</span><span class="p">(</span><span class="n">slideDuration</span><span class="p">,</span> <span class="s2">"slideDuration"</span><span class="p">)</span> |
| <span class="n">check_string_field</span><span class="p">(</span><span class="n">startTime</span><span class="p">,</span> <span class="s2">"startTime"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"window"</span><span class="p">,</span> <span class="n">time_col</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">,</span> <span class="n">slideDuration</span><span class="p">,</span> <span class="n">startTime</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">slideDuration</span><span class="p">:</span> |
| <span class="n">check_string_field</span><span class="p">(</span><span class="n">slideDuration</span><span class="p">,</span> <span class="s2">"slideDuration"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"window"</span><span class="p">,</span> <span class="n">time_col</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">,</span> <span class="n">slideDuration</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">startTime</span><span class="p">:</span> |
| <span class="n">check_string_field</span><span class="p">(</span><span class="n">startTime</span><span class="p">,</span> <span class="s2">"startTime"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"window"</span><span class="p">,</span> <span class="n">time_col</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">,</span> <span class="n">startTime</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"window"</span><span class="p">,</span> <span class="n">time_col</span><span class="p">,</span> <span class="n">windowDuration</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="window_time"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.window_time.html#pyspark.sql.functions.window_time">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">window_time</span><span class="p">(</span> |
| <span class="n">windowColumn</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Computes the event time from a window column. The column window values are produced</span> |
| <span class="sd"> by window aggregating operators and are of type `STRUCT<start: TIMESTAMP, end: TIMESTAMP>`</span> |
| <span class="sd"> where start is inclusive and end is exclusive. The event time of records produced by window</span> |
| <span class="sd"> aggregating operators can be computed as ``window_time(window)`` and are</span> |
| <span class="sd"> ``window.end - lit(1).alias("microsecond")`` (as microsecond is the minimal supported event</span> |
| <span class="sd"> time precision). The window column must be one produced by a window aggregating operator.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> windowColumn : :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> The window column of a window aggregate records.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import datetime</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(datetime.datetime(2016, 3, 11, 9, 0, 7), 1)],</span> |
| <span class="sd"> ... ).toDF("date", "val")</span> |
| |
| <span class="sd"> Group the data into 5 second time windows and aggregate as sum.</span> |
| |
| <span class="sd"> >>> w = df.groupBy(window("date", "5 seconds")).agg(sum("val").alias("sum"))</span> |
| |
| <span class="sd"> Extract the window event time using the window_time function.</span> |
| |
| <span class="sd"> >>> w.select(</span> |
| <span class="sd"> ... w.window.end.cast("string").alias("end"),</span> |
| <span class="sd"> ... window_time(w.window).cast("string").alias("window_time"),</span> |
| <span class="sd"> ... "sum"</span> |
| <span class="sd"> ... ).collect()</span> |
| <span class="sd"> [Row(end='2016-03-11 09:00:10', window_time='2016-03-11 09:00:09.999999', sum=1)]</span> |
| <span class="sd"> """</span> |
| <span class="n">window_col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">windowColumn</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"window_time"</span><span class="p">,</span> <span class="n">window_col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="session_window"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.session_window.html#pyspark.sql.functions.session_window">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">session_window</span><span class="p">(</span><span class="n">timeColumn</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">gapDuration</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Generates session window given a timestamp specifying column.</span> |
| <span class="sd"> Session window is one of dynamic windows, which means the length of window is varying</span> |
| <span class="sd"> according to the given inputs. The length of session window is defined as "the timestamp</span> |
| <span class="sd"> of latest input of the session + gap duration", so when the new inputs are bound to the</span> |
| <span class="sd"> current session window, the end time of session window can be expanded according to the new</span> |
| <span class="sd"> inputs.</span> |
| <span class="sd"> Windows can support microsecond precision. Windows in the order of months are not supported.</span> |
| <span class="sd"> For a streaming query, you may use the function `current_timestamp` to generate windows on</span> |
| <span class="sd"> processing time.</span> |
| <span class="sd"> gapDuration is provided as strings, e.g. '1 second', '1 day 12 hours', '2 minutes'. Valid</span> |
| <span class="sd"> interval strings are 'week', 'day', 'hour', 'minute', 'second', 'millisecond', 'microsecond'.</span> |
| <span class="sd"> It could also be a Column which can be evaluated to gap duration dynamically based on the</span> |
| <span class="sd"> input row.</span> |
| <span class="sd"> The output column will be a struct called 'session_window' by default with the nested columns</span> |
| <span class="sd"> 'start' and 'end', where 'start' and 'end' will be of :class:`pyspark.sql.types.TimestampType`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.2.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> timeColumn : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The column name or column to use as the timestamp for windowing by time.</span> |
| <span class="sd"> The time column must be of TimestampType or TimestampNTZType.</span> |
| <span class="sd"> gapDuration : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A Python string literal or column specifying the timeout of the session. It could be</span> |
| <span class="sd"> static value, e.g. `10 minutes`, `1 second`, or an expression/UDF that specifies gap</span> |
| <span class="sd"> duration dynamically based on the input row.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("2016-03-11 09:00:07", 1)]).toDF("date", "val")</span> |
| <span class="sd"> >>> w = df.groupBy(session_window("date", "5 seconds")).agg(sum("val").alias("sum"))</span> |
| <span class="sd"> >>> w.select(w.session_window.start.cast("string").alias("start"),</span> |
| <span class="sd"> ... w.session_window.end.cast("string").alias("end"), "sum").collect()</span> |
| <span class="sd"> [Row(start='2016-03-11 09:00:07', end='2016-03-11 09:00:12', sum=1)]</span> |
| <span class="sd"> >>> w = df.groupBy(session_window("date", lit("5 seconds"))).agg(sum("val").alias("sum"))</span> |
| <span class="sd"> >>> w.select(w.session_window.start.cast("string").alias("start"),</span> |
| <span class="sd"> ... w.session_window.end.cast("string").alias("end"), "sum").collect()</span> |
| <span class="sd"> [Row(start='2016-03-11 09:00:07', end='2016-03-11 09:00:12', sum=1)]</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="nf">check_field</span><span class="p">(</span><span class="n">field</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> <span class="n">fieldName</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">field</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">field</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_COLUMN_OR_STR"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"arg_name"</span><span class="p">:</span> <span class="n">fieldName</span><span class="p">,</span> <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">field</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="n">time_col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">timeColumn</span><span class="p">)</span> |
| <span class="n">check_field</span><span class="p">(</span><span class="n">gapDuration</span><span class="p">,</span> <span class="s2">"gapDuration"</span><span class="p">)</span> |
| <span class="n">gap_duration</span> <span class="o">=</span> <span class="n">gapDuration</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">gapDuration</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">gapDuration</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"session_window"</span><span class="p">,</span> <span class="n">time_col</span><span class="p">,</span> <span class="n">gap_duration</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="to_unix_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_unix_timestamp.html#pyspark.sql.functions.to_unix_timestamp">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">to_unix_timestamp</span><span class="p">(</span> |
| <span class="n">timestamp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the UNIX timestamp of the given time.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| <span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> format to use to convert UNIX timestamp values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("2016-04-08",)], ["e"])</span> |
| <span class="sd"> >>> df.select(to_unix_timestamp(df.e, lit("yyyy-MM-dd")).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=1460098800)]</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("2016-04-08",)], ["e"])</span> |
| <span class="sd"> >>> df.select(to_unix_timestamp(df.e).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=None)]</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"to_unix_timestamp"</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"to_unix_timestamp"</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="to_timestamp_ltz"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_timestamp_ltz.html#pyspark.sql.functions.to_timestamp_ltz">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">to_timestamp_ltz</span><span class="p">(</span> |
| <span class="n">timestamp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Parses the `timestamp` with the `format` to a timestamp without time zone.</span> |
| <span class="sd"> Returns null with invalid input.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| <span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> format to use to convert type `TimestampType` timestamp values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("2016-12-31",)], ["e"])</span> |
| <span class="sd"> >>> df.select(to_timestamp_ltz(df.e, lit("yyyy-MM-dd")).alias('r')).collect()</span> |
| <span class="sd"> ... # doctest: +SKIP</span> |
| <span class="sd"> [Row(r=datetime.datetime(2016, 12, 31, 0, 0))]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([("2016-12-31",)], ["e"])</span> |
| <span class="sd"> >>> df.select(to_timestamp_ltz(df.e).alias('r')).collect()</span> |
| <span class="sd"> ... # doctest: +SKIP</span> |
| <span class="sd"> [Row(r=datetime.datetime(2016, 12, 31, 0, 0))]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"to_timestamp_ltz"</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"to_timestamp_ltz"</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="to_timestamp_ntz"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_timestamp_ntz.html#pyspark.sql.functions.to_timestamp_ntz">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">to_timestamp_ntz</span><span class="p">(</span> |
| <span class="n">timestamp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Parses the `timestamp` with the `format` to a timestamp without time zone.</span> |
| <span class="sd"> Returns null with invalid input.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> timestamp : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| <span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> format to use to convert type `TimestampNTZType` timestamp values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("2016-04-08",)], ["e"])</span> |
| <span class="sd"> >>> df.select(to_timestamp_ntz(df.e, lit("yyyy-MM-dd")).alias('r')).collect()</span> |
| <span class="sd"> ... # doctest: +SKIP</span> |
| <span class="sd"> [Row(r=datetime.datetime(2016, 4, 8, 0, 0))]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([("2016-04-08",)], ["e"])</span> |
| <span class="sd"> >>> df.select(to_timestamp_ntz(df.e).alias('r')).collect()</span> |
| <span class="sd"> ... # doctest: +SKIP</span> |
| <span class="sd"> [Row(r=datetime.datetime(2016, 4, 8, 0, 0))]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"to_timestamp_ntz"</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"to_timestamp_ntz"</span><span class="p">,</span> <span class="n">timestamp</span><span class="p">)</span></div> |
| |
| |
| <span class="c1"># ---------------------------- misc functions ----------------------------------</span> |
| |
| |
| <div class="viewcode-block" id="current_catalog"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.current_catalog.html#pyspark.sql.functions.current_catalog">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">current_catalog</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the current catalog.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.range(1).select(current_catalog()).show()</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> |current_catalog()|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> | spark_catalog|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"current_catalog"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="current_database"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.current_database.html#pyspark.sql.functions.current_database">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">current_database</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the current database.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.range(1).select(current_database()).show()</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> |current_database()|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> | default|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"current_database"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="current_schema"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.current_schema.html#pyspark.sql.functions.current_schema">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">current_schema</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the current database.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(1).select(sf.current_schema()).show()</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> |current_database()|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> | default|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"current_schema"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="current_user"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.current_user.html#pyspark.sql.functions.current_user">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">current_user</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the current database.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.range(1).select(current_user()).show() # doctest: +SKIP</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> |current_user()|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> | ruifeng.zheng|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"current_user"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="user"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.user.html#pyspark.sql.functions.user">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">user</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the current database.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(1).select(sf.user()).show() # doctest: +SKIP</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> |current_user()|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> | ruifeng.zheng|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"user"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="crc32"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.crc32.html#pyspark.sql.functions.crc32">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">crc32</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Calculates the cyclic redundancy check value (CRC32) of a binary column and</span> |
| <span class="sd"> returns the value as a bigint.</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.createDataFrame([('ABC',)], ['a']).select(crc32('a').alias('crc32')).collect()</span> |
| <span class="sd"> [Row(crc32=2743272264)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"crc32"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="md5"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.md5.html#pyspark.sql.functions.md5">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">md5</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Calculates the MD5 digest and returns the value as a 32 character hex string.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.createDataFrame([('ABC',)], ['a']).select(md5('a').alias('hash')).collect()</span> |
| <span class="sd"> [Row(hash='902fbdd2b1df0c4f70b4a5d23525e932')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"md5"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="sha1"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sha1.html#pyspark.sql.functions.sha1">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">sha1</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the hex string result of SHA-1.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.createDataFrame([('ABC',)], ['a']).select(sha1('a').alias('hash')).collect()</span> |
| <span class="sd"> [Row(hash='3c01bdbb26f358bab27f267924aa2c9a03fcfdb8')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"sha1"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="sha2"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sha2.html#pyspark.sql.functions.sha2">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">sha2</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">numBits</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384,</span> |
| <span class="sd"> and SHA-512). The numBits indicates the desired bit length of the result, which must have a</span> |
| <span class="sd"> value of 224, 256, 384, 512, or 0 (which is equivalent to 256).</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| <span class="sd"> numBits : int</span> |
| <span class="sd"> the desired bit length of the result, which must have a</span> |
| <span class="sd"> value of 224, 256, 384, 512, or 0 (which is equivalent to 256).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([["Alice"], ["Bob"]], ["name"])</span> |
| <span class="sd"> >>> df.withColumn("sha2", sha2(df.name, 256)).show(truncate=False)</span> |
| <span class="sd"> +-----+----------------------------------------------------------------+</span> |
| <span class="sd"> |name |sha2 |</span> |
| <span class="sd"> +-----+----------------------------------------------------------------+</span> |
| <span class="sd"> |Alice|3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043|</span> |
| <span class="sd"> |Bob |cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961|</span> |
| <span class="sd"> +-----+----------------------------------------------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"sha2"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">numBits</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="hash"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hash.html#pyspark.sql.functions.hash">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">hash</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Calculates the hash code of given columns, and returns the result as an int column.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> one or more columns to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> hash value as int column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('ABC', 'DEF')], ['c1', 'c2'])</span> |
| |
| <span class="sd"> Hash for one column</span> |
| |
| <span class="sd"> >>> df.select(hash('c1').alias('hash')).show()</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> | hash|</span> |
| <span class="sd"> +----------+</span> |
| <span class="sd"> |-757602832|</span> |
| <span class="sd"> +----------+</span> |
| |
| <span class="sd"> Two or more columns</span> |
| |
| <span class="sd"> >>> df.select(hash('c1', 'c2').alias('hash')).show()</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> | hash|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |599895104|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"hash"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="xxhash64"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.xxhash64.html#pyspark.sql.functions.xxhash64">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">xxhash64</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Calculates the hash code of given columns using the 64-bit variant of the xxHash algorithm,</span> |
| <span class="sd"> and returns the result as a long column. The hash computation uses an initial seed of 42.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> one or more columns to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> hash value as long column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('ABC', 'DEF')], ['c1', 'c2'])</span> |
| |
| <span class="sd"> Hash for one column</span> |
| |
| <span class="sd"> >>> df.select(xxhash64('c1').alias('hash')).show()</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> | hash|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> |4105715581806190027|</span> |
| <span class="sd"> +-------------------+</span> |
| |
| <span class="sd"> Two or more columns</span> |
| |
| <span class="sd"> >>> df.select(xxhash64('c1', 'c2').alias('hash')).show()</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> | hash|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> |3233247871021311208|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"xxhash64"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="assert_true"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.assert_true.html#pyspark.sql.functions.assert_true">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">assert_true</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">errMsg</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns `null` if the input column is `true`; throws an exception</span> |
| <span class="sd"> with the provided error message otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column name or column that represents the input column to test</span> |
| <span class="sd"> errMsg : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> A Python string literal or column containing the error message</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> `null` if the input column is `true` otherwise throws an error with specified message.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(0,1)], ['a', 'b'])</span> |
| <span class="sd"> >>> df.select(assert_true(df.a < df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=None)]</span> |
| <span class="sd"> >>> df.select(assert_true(df.a < df.b, df.a).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=None)]</span> |
| <span class="sd"> >>> df.select(assert_true(df.a < df.b, 'error').alias('r')).collect()</span> |
| <span class="sd"> [Row(r=None)]</span> |
| <span class="sd"> >>> df.select(assert_true(df.a > df.b, 'My error msg').alias('r')).collect() # doctest: +SKIP</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> java.lang.RuntimeException: My error msg</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">errMsg</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"assert_true"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">errMsg</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_COLUMN_OR_STR"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"arg_name"</span><span class="p">:</span> <span class="s2">"errMsg"</span><span class="p">,</span> <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="n">errMsg</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">errMsg</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"assert_true"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">errMsg</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="raise_error"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.raise_error.html#pyspark.sql.functions.raise_error">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">raise_error</span><span class="p">(</span><span class="n">errMsg</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Throws an exception with the provided error message.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> errMsg : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A Python string literal or column containing the error message</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> throws an error with specified message.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(raise_error("My error message")).show() # doctest: +SKIP</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> java.lang.RuntimeException: My error message</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">errMsg</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_COLUMN_OR_STR"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"arg_name"</span><span class="p">:</span> <span class="s2">"errMsg"</span><span class="p">,</span> <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="n">errMsg</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">errMsg</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">errMsg</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"raise_error"</span><span class="p">,</span> <span class="n">errMsg</span><span class="p">)</span></div> |
| |
| |
| <span class="c1"># ---------------------- String/Binary functions ------------------------------</span> |
| |
| |
| <div class="viewcode-block" id="upper"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.upper.html#pyspark.sql.functions.upper">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">upper</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Converts a string expression to upper case.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> upper case values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING")</span> |
| <span class="sd"> >>> df.select(upper("value")).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |upper(value)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | SPARK|</span> |
| <span class="sd"> | PYSPARK|</span> |
| <span class="sd"> | PANDAS API|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"upper"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="lower"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.lower.html#pyspark.sql.functions.lower">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">lower</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Converts a string expression to lower case.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> lower case values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING")</span> |
| <span class="sd"> >>> df.select(lower("value")).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |lower(value)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | spark|</span> |
| <span class="sd"> | pyspark|</span> |
| <span class="sd"> | pandas api|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"lower"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="ascii"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ascii.html#pyspark.sql.functions.ascii">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">ascii</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the numeric value of the first character of the string column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> numeric value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING")</span> |
| <span class="sd"> >>> df.select(ascii("value")).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |ascii(value)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 83|</span> |
| <span class="sd"> | 80|</span> |
| <span class="sd"> | 80|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"ascii"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="base64"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.base64.html#pyspark.sql.functions.base64">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">base64</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the BASE64 encoding of a binary column and returns it as a string column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> BASE64 encoding of string value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING")</span> |
| <span class="sd"> >>> df.select(base64("value")).show()</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> | base64(value)|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> | U3Bhcms=|</span> |
| <span class="sd"> | UHlTcGFyaw==|</span> |
| <span class="sd"> |UGFuZGFzIEFQSQ==|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"base64"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="unbase64"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unbase64.html#pyspark.sql.functions.unbase64">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">unbase64</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Decodes a BASE64 encoded string column and returns it as a binary column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> encoded string value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(["U3Bhcms=",</span> |
| <span class="sd"> ... "UHlTcGFyaw==",</span> |
| <span class="sd"> ... "UGFuZGFzIEFQSQ=="], "STRING")</span> |
| <span class="sd"> >>> df.select(unbase64("value")).show()</span> |
| <span class="sd"> +--------------------+</span> |
| <span class="sd"> | unbase64(value)|</span> |
| <span class="sd"> +--------------------+</span> |
| <span class="sd"> | [53 70 61 72 6B]|</span> |
| <span class="sd"> |[50 79 53 70 61 7...|</span> |
| <span class="sd"> |[50 61 6E 64 61 7...|</span> |
| <span class="sd"> +--------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"unbase64"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="ltrim"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ltrim.html#pyspark.sql.functions.ltrim">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">ltrim</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Trim the spaces from left end for the specified string value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> left trimmed values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], "STRING")</span> |
| <span class="sd"> >>> df.select(ltrim("value").alias("r")).withColumn("length", length("r")).show()</span> |
| <span class="sd"> +-------+------+</span> |
| <span class="sd"> | r|length|</span> |
| <span class="sd"> +-------+------+</span> |
| <span class="sd"> | Spark| 5|</span> |
| <span class="sd"> |Spark | 7|</span> |
| <span class="sd"> | Spark| 5|</span> |
| <span class="sd"> +-------+------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"ltrim"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="rtrim"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.rtrim.html#pyspark.sql.functions.rtrim">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">rtrim</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Trim the spaces from right end for the specified string value.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> right trimmed values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], "STRING")</span> |
| <span class="sd"> >>> df.select(rtrim("value").alias("r")).withColumn("length", length("r")).show()</span> |
| <span class="sd"> +--------+------+</span> |
| <span class="sd"> | r|length|</span> |
| <span class="sd"> +--------+------+</span> |
| <span class="sd"> | Spark| 8|</span> |
| <span class="sd"> | Spark| 5|</span> |
| <span class="sd"> | Spark| 6|</span> |
| <span class="sd"> +--------+------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"rtrim"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="trim"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.trim.html#pyspark.sql.functions.trim">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">trim</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Trim the spaces from both ends for the specified string column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> trimmed values from both sides.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([" Spark", "Spark ", " Spark"], "STRING")</span> |
| <span class="sd"> >>> df.select(trim("value").alias("r")).withColumn("length", length("r")).show()</span> |
| <span class="sd"> +-----+------+</span> |
| <span class="sd"> | r|length|</span> |
| <span class="sd"> +-----+------+</span> |
| <span class="sd"> |Spark| 5|</span> |
| <span class="sd"> |Spark| 5|</span> |
| <span class="sd"> |Spark| 5|</span> |
| <span class="sd"> +-----+------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"trim"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="concat_ws"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.concat_ws.html#pyspark.sql.functions.concat_ws">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">concat_ws</span><span class="p">(</span><span class="n">sep</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Concatenates multiple input string columns together into a single string column,</span> |
| <span class="sd"> using the given separator.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> sep : str</span> |
| <span class="sd"> words separator.</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> list of columns to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> string of concatenated words.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])</span> |
| <span class="sd"> >>> df.select(concat_ws('-', df.s, df.d).alias('s')).collect()</span> |
| <span class="sd"> [Row(s='abcd-123')]</span> |
| <span class="sd"> """</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"concat_ws"</span><span class="p">,</span> <span class="n">sep</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="decode"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.decode.html#pyspark.sql.functions.decode">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">decode</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">charset</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the first argument into a string from a binary using the provided character set</span> |
| <span class="sd"> (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> charset : str</span> |
| <span class="sd"> charset to use to decode to.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('abcd',)], ['a'])</span> |
| <span class="sd"> >>> df.select(decode("a", "UTF-8")).show()</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> |decode(a, UTF-8)|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> | abcd|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"decode"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">charset</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="encode"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.encode.html#pyspark.sql.functions.encode">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">encode</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">charset</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes the first argument into a binary from a string using the provided character set</span> |
| <span class="sd"> (one of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16').</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> charset : str</span> |
| <span class="sd"> charset to use to encode.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column for computed results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('abcd',)], ['c'])</span> |
| <span class="sd"> >>> df.select(encode("c", "UTF-8")).show()</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> |encode(c, UTF-8)|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> | [61 62 63 64]|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"encode"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">charset</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="format_number"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.format_number.html#pyspark.sql.functions.format_number">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">format_number</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">d</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Formats the number X to a format like '#,--#,--#.--', rounded to d decimal places</span> |
| <span class="sd"> with HALF_EVEN round mode, and returns the result as a string.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the column name of the numeric value to be formatted</span> |
| <span class="sd"> d : int</span> |
| <span class="sd"> the N decimal places</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column of formatted results.</span> |
| |
| <span class="sd"> >>> spark.createDataFrame([(5,)], ['a']).select(format_number('a', 4).alias('v')).collect()</span> |
| <span class="sd"> [Row(v='5.0000')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"format_number"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">d</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="format_string"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.format_string.html#pyspark.sql.functions.format_string">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">format_string</span><span class="p">(</span><span class="nb">format</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Formats the arguments in printf-style and returns the result as a string column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> format : str</span> |
| <span class="sd"> string that can contain embedded format tags and used as result column's value</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column names or :class:`~pyspark.sql.Column`\\s to be used in formatting</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the column of formatted results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(5, "hello")], ['a', 'b'])</span> |
| <span class="sd"> >>> df.select(format_string('%d %s', df.a, df.b).alias('v')).collect()</span> |
| <span class="sd"> [Row(v='5 hello')]</span> |
| <span class="sd"> """</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"format_string"</span><span class="p">,</span> <span class="nb">format</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="instr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.instr.html#pyspark.sql.functions.instr">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">instr</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">substr</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Locate the position of the first occurrence of substr column in the given string.</span> |
| <span class="sd"> Returns null if either of the arguments are null.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The position is not zero based, but 1 based index. Returns 0 if substr</span> |
| <span class="sd"> could not be found in str.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> substr : str</span> |
| <span class="sd"> substring to look for.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> location of the first occurrence of the substring as integer.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('abcd',)], ['s',])</span> |
| <span class="sd"> >>> df.select(instr(df.s, 'b').alias('s')).collect()</span> |
| <span class="sd"> [Row(s=2)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"instr"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">substr</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="overlay"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.overlay.html#pyspark.sql.functions.overlay">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">overlay</span><span class="p">(</span> |
| <span class="n">src</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">replace</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">pos</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">int</span><span class="p">],</span> |
| <span class="nb">len</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Overlay the specified portion of `src` with `replace`,</span> |
| <span class="sd"> starting from byte position `pos` of `src` and proceeding for `len` bytes.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> src : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column name or column containing the string that will be replaced</span> |
| <span class="sd"> replace : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column name or column containing the substitution string</span> |
| <span class="sd"> pos : :class:`~pyspark.sql.Column` or str or int</span> |
| <span class="sd"> column name, column, or int containing the starting position in src</span> |
| <span class="sd"> len : :class:`~pyspark.sql.Column` or str or int, optional</span> |
| <span class="sd"> column name, column, or int containing the number of bytes to replace in src</span> |
| <span class="sd"> string by 'replace' defaults to -1, which represents the length of the 'replace' string</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> string with replaced values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("SPARK_SQL", "CORE")], ("x", "y"))</span> |
| <span class="sd"> >>> df.select(overlay("x", "y", 7).alias("overlayed")).collect()</span> |
| <span class="sd"> [Row(overlayed='SPARK_CORE')]</span> |
| <span class="sd"> >>> df.select(overlay("x", "y", 7, 0).alias("overlayed")).collect()</span> |
| <span class="sd"> [Row(overlayed='SPARK_CORESQL')]</span> |
| <span class="sd"> >>> df.select(overlay("x", "y", 7, 2).alias("overlayed")).collect()</span> |
| <span class="sd"> [Row(overlayed='SPARK_COREL')]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pos</span><span class="p">,</span> <span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_COLUMN_OR_INT_OR_STR"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"arg_name"</span><span class="p">:</span> <span class="s2">"pos"</span><span class="p">,</span> <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">pos</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="nb">len</span><span class="p">,</span> <span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_COLUMN_OR_INT_OR_STR"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"arg_name"</span><span class="p">:</span> <span class="s2">"len"</span><span class="p">,</span> <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="nb">len</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="n">pos</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">pos</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pos</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">pos</span><span class="p">)</span> |
| <span class="nb">len</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="nb">len</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="nb">len</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">len</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"overlay"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">src</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">replace</span><span class="p">),</span> <span class="n">pos</span><span class="p">,</span> <span class="nb">len</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="sentences"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sentences.html#pyspark.sql.functions.sentences">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">sentences</span><span class="p">(</span> |
| <span class="n">string</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">language</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">country</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Splits a string into arrays of sentences, where each sentence is an array of words.</span> |
| <span class="sd"> The 'language' and 'country' arguments are optional, and if omitted, the default locale is used.</span> |
| |
| <span class="sd"> .. versionadded:: 3.2.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> string : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a string to be split</span> |
| <span class="sd"> language : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> a language of the locale</span> |
| <span class="sd"> country : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> a country of the locale</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> arrays of split sentences.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([["This is an example sentence."]], ["string"])</span> |
| <span class="sd"> >>> df.select(sentences(df.string, lit("en"), lit("US"))).show(truncate=False)</span> |
| <span class="sd"> +-----------------------------------+</span> |
| <span class="sd"> |sentences(string, en, US) |</span> |
| <span class="sd"> +-----------------------------------+</span> |
| <span class="sd"> |[[This, is, an, example, sentence]]|</span> |
| <span class="sd"> +-----------------------------------+</span> |
| <span class="sd"> >>> df = spark.createDataFrame([["Hello world. How are you?"]], ["s"])</span> |
| <span class="sd"> >>> df.select(sentences("s")).show(truncate=False)</span> |
| <span class="sd"> +---------------------------------+</span> |
| <span class="sd"> |sentences(s, , ) |</span> |
| <span class="sd"> +---------------------------------+</span> |
| <span class="sd"> |[[Hello, world], [How, are, you]]|</span> |
| <span class="sd"> +---------------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">language</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">language</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">""</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">country</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">country</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">""</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"sentences"</span><span class="p">,</span> <span class="n">string</span><span class="p">,</span> <span class="n">language</span><span class="p">,</span> <span class="n">country</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="substring"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.substring.html#pyspark.sql.functions.substring">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">substring</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">pos</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="nb">len</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Substring starts at `pos` and is of length `len` when str is String type or</span> |
| <span class="sd"> returns the slice of byte array that starts at `pos` in byte and is of length `len`</span> |
| <span class="sd"> when str is Binary type.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The position is not zero based, but 1 based index.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> pos : int</span> |
| <span class="sd"> starting position in str.</span> |
| <span class="sd"> len : int</span> |
| <span class="sd"> length of chars.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> substring of given value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('abcd',)], ['s',])</span> |
| <span class="sd"> >>> df.select(substring(df.s, 1, 2).alias('s')).collect()</span> |
| <span class="sd"> [Row(s='ab')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"substring"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">pos</span><span class="p">,</span> <span class="nb">len</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="substring_index"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.substring_index.html#pyspark.sql.functions.substring_index">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">substring_index</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">delim</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">count</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the substring from string str before count occurrences of the delimiter delim.</span> |
| <span class="sd"> If count is positive, everything the left of the final delimiter (counting from left) is</span> |
| <span class="sd"> returned. If count is negative, every to the right of the final delimiter (counting from the</span> |
| <span class="sd"> right) is returned. substring_index performs a case-sensitive match when searching for delim.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> delim : str</span> |
| <span class="sd"> delimiter of values.</span> |
| <span class="sd"> count : int</span> |
| <span class="sd"> number of occurrences.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> substring of given value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('a.b.c.d',)], ['s'])</span> |
| <span class="sd"> >>> df.select(substring_index(df.s, '.', 2).alias('s')).collect()</span> |
| <span class="sd"> [Row(s='a.b')]</span> |
| <span class="sd"> >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect()</span> |
| <span class="sd"> [Row(s='b.c.d')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"substring_index"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">delim</span><span class="p">,</span> <span class="n">count</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="levenshtein"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.levenshtein.html#pyspark.sql.functions.levenshtein">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">levenshtein</span><span class="p">(</span> |
| <span class="n">left</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">right</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">threshold</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Computes the Levenshtein distance of the two given strings.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> left : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> first column value.</span> |
| <span class="sd"> right : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> second column value.</span> |
| <span class="sd"> threshold : int, optional</span> |
| <span class="sd"> if set when the levenshtein distance of the two given strings</span> |
| <span class="sd"> less than or equal to a given threshold then return result distance, or -1</span> |
| |
| <span class="sd"> .. versionchanged: 3.5.0</span> |
| <span class="sd"> Added ``threshold`` argument.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> Levenshtein distance as integer value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df0 = spark.createDataFrame([('kitten', 'sitting',)], ['l', 'r'])</span> |
| <span class="sd"> >>> df0.select(levenshtein('l', 'r').alias('d')).collect()</span> |
| <span class="sd"> [Row(d=3)]</span> |
| <span class="sd"> >>> df0.select(levenshtein('l', 'r', 2).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=-1)]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">threshold</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"levenshtein"</span><span class="p">,</span> <span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span> |
| <span class="s2">"levenshtein"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">left</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">right</span><span class="p">),</span> <span class="n">threshold</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="locate"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.locate.html#pyspark.sql.functions.locate">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">locate</span><span class="p">(</span><span class="n">substr</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">pos</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Locate the position of the first occurrence of substr in a string column, after position pos.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> substr : str</span> |
| <span class="sd"> a string</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a Column of :class:`pyspark.sql.types.StringType`</span> |
| <span class="sd"> pos : int, optional</span> |
| <span class="sd"> start position (zero based)</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> position of the substring.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The position is not zero based, but 1 based index. Returns 0 if substr</span> |
| <span class="sd"> could not be found in str.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('abcd',)], ['s',])</span> |
| <span class="sd"> >>> df.select(locate('b', df.s, 1).alias('s')).collect()</span> |
| <span class="sd"> [Row(s=2)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"locate"</span><span class="p">,</span> <span class="n">substr</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">pos</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="lpad"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.lpad.html#pyspark.sql.functions.lpad">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">lpad</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">len</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">pad</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Left-pad the string column to width `len` with `pad`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> len : int</span> |
| <span class="sd"> length of the final string.</span> |
| <span class="sd"> pad : str</span> |
| <span class="sd"> chars to prepend.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> left padded result.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('abcd',)], ['s',])</span> |
| <span class="sd"> >>> df.select(lpad(df.s, 6, '#').alias('s')).collect()</span> |
| <span class="sd"> [Row(s='##abcd')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"lpad"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="nb">len</span><span class="p">,</span> <span class="n">pad</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="rpad"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.rpad.html#pyspark.sql.functions.rpad">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">rpad</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">len</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">pad</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Right-pad the string column to width `len` with `pad`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> len : int</span> |
| <span class="sd"> length of the final string.</span> |
| <span class="sd"> pad : str</span> |
| <span class="sd"> chars to append.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> right padded result.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('abcd',)], ['s',])</span> |
| <span class="sd"> >>> df.select(rpad(df.s, 6, '#').alias('s')).collect()</span> |
| <span class="sd"> [Row(s='abcd##')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"rpad"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="nb">len</span><span class="p">,</span> <span class="n">pad</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="repeat"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.repeat.html#pyspark.sql.functions.repeat">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">repeat</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Repeats a string column n times, and returns it as a new string column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> n : int</span> |
| <span class="sd"> number of times to repeat value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> string with repeated values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('ab',)], ['s',])</span> |
| <span class="sd"> >>> df.select(repeat(df.s, 3).alias('s')).collect()</span> |
| <span class="sd"> [Row(s='ababab')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"repeat"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">n</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="split"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.split.html#pyspark.sql.functions.split">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">split</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">pattern</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">limit</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Splits str around matches of the given pattern.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a string expression to split</span> |
| <span class="sd"> pattern : str</span> |
| <span class="sd"> a string representing a regular expression. The regex string should be</span> |
| <span class="sd"> a Java regular expression.</span> |
| <span class="sd"> limit : int, optional</span> |
| <span class="sd"> an integer which controls the number of times `pattern` is applied.</span> |
| |
| <span class="sd"> * ``limit > 0``: The resulting array's length will not be more than `limit`, and the</span> |
| <span class="sd"> resulting array's last entry will contain all input beyond the last</span> |
| <span class="sd"> matched pattern.</span> |
| <span class="sd"> * ``limit <= 0``: `pattern` will be applied as many times as possible, and the resulting</span> |
| <span class="sd"> array can be of any size.</span> |
| |
| <span class="sd"> .. versionchanged:: 3.0</span> |
| <span class="sd"> `split` now takes an optional `limit` field. If not provided, default limit value is -1.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> array of separated strings.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('oneAtwoBthreeC',)], ['s',])</span> |
| <span class="sd"> >>> df.select(split(df.s, '[ABC]', 2).alias('s')).collect()</span> |
| <span class="sd"> [Row(s=['one', 'twoBthreeC'])]</span> |
| <span class="sd"> >>> df.select(split(df.s, '[ABC]', -1).alias('s')).collect()</span> |
| <span class="sd"> [Row(s=['one', 'two', 'three', ''])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"split"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">pattern</span><span class="p">,</span> <span class="n">limit</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="rlike"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.rlike.html#pyspark.sql.functions.rlike">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">rlike</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">regexp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""Returns true if `str` matches the Java regex `regexp`, or false otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> regexp : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> regex pattern to apply.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> true if `str` matches a Java regex, or false otherwise.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("1a 2b 14m", r"(\d+)")], ["str", "regexp"])</span> |
| <span class="sd"> >>> df.select(rlike('str', lit(r'(\d+)')).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=True)]</span> |
| <span class="sd"> >>> df.select(rlike('str', lit(r'\d{2}b')).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=False)]</span> |
| <span class="sd"> >>> df.select(rlike("str", col("regexp")).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=True)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"rlike"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regexp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp.html#pyspark.sql.functions.regexp">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regexp</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">regexp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""Returns true if `str` matches the Java regex `regexp`, or false otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> regexp : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> regex pattern to apply.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> true if `str` matches a Java regex, or false otherwise.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]</span> |
| <span class="sd"> ... ).select(sf.regexp('str', sf.lit(r'(\d+)'))).show()</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> |REGEXP(str, (\d+))|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> | true|</span> |
| <span class="sd"> +------------------+</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]</span> |
| <span class="sd"> ... ).select(sf.regexp('str', sf.lit(r'\d{2}b'))).show()</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> |REGEXP(str, \d{2}b)|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> | false|</span> |
| <span class="sd"> +-------------------+</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]</span> |
| <span class="sd"> ... ).select(sf.regexp('str', sf.col("regexp"))).show()</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> |REGEXP(str, regexp)|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> | true|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regexp"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regexp_like"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp_like.html#pyspark.sql.functions.regexp_like">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regexp_like</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">regexp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""Returns true if `str` matches the Java regex `regexp`, or false otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> regexp : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> regex pattern to apply.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> true if `str` matches a Java regex, or false otherwise.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]</span> |
| <span class="sd"> ... ).select(sf.regexp_like('str', sf.lit(r'(\d+)'))).show()</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> |REGEXP_LIKE(str, (\d+))|</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> | true|</span> |
| <span class="sd"> +-----------------------+</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]</span> |
| <span class="sd"> ... ).select(sf.regexp_like('str', sf.lit(r'\d{2}b'))).show()</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> |REGEXP_LIKE(str, \d{2}b)|</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> | false|</span> |
| <span class="sd"> +------------------------+</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [("1a 2b 14m", r"(\d+)")], ["str", "regexp"]</span> |
| <span class="sd"> ... ).select(sf.regexp_like('str', sf.col("regexp"))).show()</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> |REGEXP_LIKE(str, regexp)|</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> | true|</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regexp_like"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regexp_count"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp_count.html#pyspark.sql.functions.regexp_count">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regexp_count</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">regexp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""Returns a count of the number of times that the Java regex pattern `regexp` is matched</span> |
| <span class="sd"> in the string `str`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> regexp : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> regex pattern to apply.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the number of times that a Java regex pattern is matched in the string.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("1a 2b 14m", r"\d+")], ["str", "regexp"])</span> |
| <span class="sd"> >>> df.select(regexp_count('str', lit(r'\d+')).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=3)]</span> |
| <span class="sd"> >>> df.select(regexp_count('str', lit(r'mmm')).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=0)]</span> |
| <span class="sd"> >>> df.select(regexp_count("str", col("regexp")).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=3)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regexp_count"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regexp_extract"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract.html#pyspark.sql.functions.regexp_extract">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regexp_extract</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">pattern</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">idx</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""Extract a specific group matched by the Java regex `regexp`, from the specified string column.</span> |
| <span class="sd"> If the regex did not match, or the specified group did not match, an empty string is returned.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> pattern : str</span> |
| <span class="sd"> regex pattern to apply.</span> |
| <span class="sd"> idx : int</span> |
| <span class="sd"> matched group id.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> matched value specified by `idx` group id.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('100-200',)], ['str'])</span> |
| <span class="sd"> >>> df.select(regexp_extract('str', r'(\d+)-(\d+)', 1).alias('d')).collect()</span> |
| <span class="sd"> [Row(d='100')]</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('foo',)], ['str'])</span> |
| <span class="sd"> >>> df.select(regexp_extract('str', r'(\d+)', 1).alias('d')).collect()</span> |
| <span class="sd"> [Row(d='')]</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('aaaac',)], ['str'])</span> |
| <span class="sd"> >>> df.select(regexp_extract('str', '(a+)(b)?(c)', 2).alias('d')).collect()</span> |
| <span class="sd"> [Row(d='')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"regexp_extract"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">str</span><span class="p">),</span> <span class="n">pattern</span><span class="p">,</span> <span class="n">idx</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regexp_extract_all"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp_extract_all.html#pyspark.sql.functions.regexp_extract_all">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regexp_extract_all</span><span class="p">(</span> |
| <span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">regexp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">idx</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""Extract all strings in the `str` that match the Java regex `regexp`</span> |
| <span class="sd"> and corresponding to the regex group index.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> regexp : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> regex pattern to apply.</span> |
| <span class="sd"> idx : int</span> |
| <span class="sd"> matched group id.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> all strings in the `str` that match a Java regex and corresponding to the regex group index.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("100-200, 300-400", r"(\d+)-(\d+)")], ["str", "regexp"])</span> |
| <span class="sd"> >>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)')).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=['100', '300'])]</span> |
| <span class="sd"> >>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)'), 1).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=['100', '300'])]</span> |
| <span class="sd"> >>> df.select(regexp_extract_all('str', lit(r'(\d+)-(\d+)'), 2).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=['200', '400'])]</span> |
| <span class="sd"> >>> df.select(regexp_extract_all('str', col("regexp")).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=['100', '300'])]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">idx</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regexp_extract_all"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">idx</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">idx</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">idx</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">idx</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regexp_extract_all"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">,</span> <span class="n">idx</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regexp_replace"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp_replace.html#pyspark.sql.functions.regexp_replace">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regexp_replace</span><span class="p">(</span> |
| <span class="n">string</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">pattern</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">replacement</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""Replace all substrings of the specified string value that match regexp with replacement.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> string : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column name or column containing the string value</span> |
| <span class="sd"> pattern : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column object or str containing the regexp pattern</span> |
| <span class="sd"> replacement : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column object or str containing the replacement</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> string with all substrings replaced.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("100-200", r"(\d+)", "--")], ["str", "pattern", "replacement"])</span> |
| <span class="sd"> >>> df.select(regexp_replace('str', r'(\d+)', '--').alias('d')).collect()</span> |
| <span class="sd"> [Row(d='-----')]</span> |
| <span class="sd"> >>> df.select(regexp_replace("str", col("pattern"), col("replacement")).alias('d')).collect()</span> |
| <span class="sd"> [Row(d='-----')]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pattern</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">pattern_col</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">pattern</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">pattern_col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">pattern</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">replacement</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">replacement_col</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">replacement</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">replacement_col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">replacement</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"regexp_replace"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">string</span><span class="p">),</span> <span class="n">pattern_col</span><span class="p">,</span> <span class="n">replacement_col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regexp_substr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp_substr.html#pyspark.sql.functions.regexp_substr">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regexp_substr</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">regexp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""Returns the substring that matches the Java regex `regexp` within the string `str`.</span> |
| <span class="sd"> If the regular expression is not found, the result is null.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> regexp : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> regex pattern to apply.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the substring that matches a Java regex within the string `str`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("1a 2b 14m", r"\d+")], ["str", "regexp"])</span> |
| <span class="sd"> >>> df.select(regexp_substr('str', lit(r'\d+')).alias('d')).collect()</span> |
| <span class="sd"> [Row(d='1')]</span> |
| <span class="sd"> >>> df.select(regexp_substr('str', lit(r'mmm')).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=None)]</span> |
| <span class="sd"> >>> df.select(regexp_substr("str", col("regexp")).alias('d')).collect()</span> |
| <span class="sd"> [Row(d='1')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regexp_substr"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="regexp_instr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.regexp_instr.html#pyspark.sql.functions.regexp_instr">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">regexp_instr</span><span class="p">(</span> |
| <span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">regexp</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">idx</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sa">r</span><span class="sd">"""Extract all strings in the `str` that match the Java regex `regexp`</span> |
| <span class="sd"> and corresponding to the regex group index.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> regexp : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> regex pattern to apply.</span> |
| <span class="sd"> idx : int</span> |
| <span class="sd"> matched group id.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> all strings in the `str` that match a Java regex and corresponding to the regex group index.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("1a 2b 14m", r"\d+(a|b|m)")], ["str", "regexp"])</span> |
| <span class="sd"> >>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)')).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=1)]</span> |
| <span class="sd"> >>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)'), 1).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=1)]</span> |
| <span class="sd"> >>> df.select(regexp_instr('str', lit(r'\d+(a|b|m)'), 2).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=1)]</span> |
| <span class="sd"> >>> df.select(regexp_instr('str', col("regexp")).alias('d')).collect()</span> |
| <span class="sd"> [Row(d=1)]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">idx</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regexp_instr"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">idx</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">idx</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">idx</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">idx</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"regexp_instr"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">regexp</span><span class="p">,</span> <span class="n">idx</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="initcap"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.initcap.html#pyspark.sql.functions.initcap">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">initcap</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Translate the first letter of each word to upper case in the sentence.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> string with all first letters are uppercase in each word.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.createDataFrame([('ab cd',)], ['a']).select(initcap("a").alias('v')).collect()</span> |
| <span class="sd"> [Row(v='Ab Cd')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"initcap"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="soundex"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.soundex.html#pyspark.sql.functions.soundex">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">soundex</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the SoundEx encoding for a string</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> SoundEx encoded string.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Peters",),("Uhrbach",)], ['name'])</span> |
| <span class="sd"> >>> df.select(soundex(df.name).alias("soundex")).collect()</span> |
| <span class="sd"> [Row(soundex='P362'), Row(soundex='U612')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"soundex"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bin"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bin.html#pyspark.sql.functions.bin">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bin</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the string representation of the binary value of the given column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> binary representation of given value as string.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([2,5], "INT")</span> |
| <span class="sd"> >>> df.select(bin(df.value).alias('c')).collect()</span> |
| <span class="sd"> [Row(c='10'), Row(c='101')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bin"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="hex"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hex.html#pyspark.sql.functions.hex">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">hex</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Computes hex value of the given column, which could be :class:`pyspark.sql.types.StringType`,</span> |
| <span class="sd"> :class:`pyspark.sql.types.BinaryType`, :class:`pyspark.sql.types.IntegerType` or</span> |
| <span class="sd"> :class:`pyspark.sql.types.LongType`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> hexadecimal representation of given value as string.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.createDataFrame([('ABC', 3)], ['a', 'b']).select(hex('a'), hex('b')).collect()</span> |
| <span class="sd"> [Row(hex(a)='414243', hex(b)='3')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"hex"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="unhex"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unhex.html#pyspark.sql.functions.unhex">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">unhex</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Inverse of hex. Interprets each pair of characters as a hexadecimal number</span> |
| <span class="sd"> and converts to the byte representation of number.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> string representation of given hexadecimal value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.createDataFrame([('414243',)], ['a']).select(unhex('a')).collect()</span> |
| <span class="sd"> [Row(unhex(a)=bytearray(b'ABC'))]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"unhex"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="length"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.length.html#pyspark.sql.functions.length">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">length</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Computes the character length of string data or number of bytes of binary data.</span> |
| <span class="sd"> The length of character data includes the trailing spaces. The length of binary data</span> |
| <span class="sd"> includes binary zeros.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> length of the value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.createDataFrame([('ABC ',)], ['a']).select(length('a').alias('length')).collect()</span> |
| <span class="sd"> [Row(length=4)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"length"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="octet_length"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.octet_length.html#pyspark.sql.functions.octet_length">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">octet_length</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Calculates the byte length for the specified string column.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Source column or strings</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> Byte length of the col</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import octet_length</span> |
| <span class="sd"> >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']) \\</span> |
| <span class="sd"> ... .select(octet_length('cat')).collect()</span> |
| <span class="sd"> [Row(octet_length(cat)=3), Row(octet_length(cat)=4)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"octet_length"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bit_length"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bit_length.html#pyspark.sql.functions.bit_length">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bit_length</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Calculates the bit length for the specified string column.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Source column or strings</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> Bit length of the col</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import bit_length</span> |
| <span class="sd"> >>> spark.createDataFrame([('cat',), ( '\U0001F408',)], ['cat']) \\</span> |
| <span class="sd"> ... .select(bit_length('cat')).collect()</span> |
| <span class="sd"> [Row(bit_length(cat)=24), Row(bit_length(cat)=32)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bit_length"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="translate"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.translate.html#pyspark.sql.functions.translate">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">translate</span><span class="p">(</span><span class="n">srcCol</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">matching</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">replace</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""A function translate any character in the `srcCol` by a character in `matching`.</span> |
| <span class="sd"> The characters in `replace` is corresponding to the characters in `matching`.</span> |
| <span class="sd"> Translation will happen whenever any character in the string is matching with the character</span> |
| <span class="sd"> in the `matching`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> srcCol : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Source column or strings</span> |
| <span class="sd"> matching : str</span> |
| <span class="sd"> matching characters.</span> |
| <span class="sd"> replace : str</span> |
| <span class="sd"> characters for replacement. If this is shorter than `matching` string then</span> |
| <span class="sd"> those chars that don't have replacement will be dropped.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> replaced value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.createDataFrame([('translate',)], ['a']).select(translate('a', "rnlt", "123") \\</span> |
| <span class="sd"> ... .alias('r')).collect()</span> |
| <span class="sd"> [Row(r='1a2s3ae')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"translate"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">srcCol</span><span class="p">),</span> <span class="n">matching</span><span class="p">,</span> <span class="n">replace</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="to_binary"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_binary.html#pyspark.sql.functions.to_binary">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">to_binary</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Converts the input `col` to a binary value based on the supplied `format`.</span> |
| <span class="sd"> The `format` can be a case-insensitive string literal of "hex", "utf-8", "utf8",</span> |
| <span class="sd"> or "base64". By default, the binary format for conversion is "hex" if</span> |
| <span class="sd"> `format` is omitted. The function returns NULL if at least one of the</span> |
| <span class="sd"> input parameters is NULL.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| <span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> format to use to convert binary values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("abc",)], ["e"])</span> |
| <span class="sd"> >>> df.select(to_binary(df.e, lit("utf-8")).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=bytearray(b'abc'))]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([("414243",)], ["e"])</span> |
| <span class="sd"> >>> df.select(to_binary(df.e).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=bytearray(b'ABC'))]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"to_binary"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"to_binary"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="to_char"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_char.html#pyspark.sql.functions.to_char">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">to_char</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Convert `col` to a string based on the `format`.</span> |
| <span class="sd"> Throws an exception if the conversion fails. The format can consist of the following</span> |
| <span class="sd"> characters, case insensitive:</span> |
| <span class="sd"> '0' or '9': Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the</span> |
| <span class="sd"> format string matches a sequence of digits in the input value, generating a result</span> |
| <span class="sd"> string of the same length as the corresponding sequence in the format string.</span> |
| <span class="sd"> The result string is left-padded with zeros if the 0/9 sequence comprises more digits</span> |
| <span class="sd"> than the matching part of the decimal value, starts with 0, and is before the decimal</span> |
| <span class="sd"> point. Otherwise, it is padded with spaces.</span> |
| <span class="sd"> '.' or 'D': Specifies the position of the decimal point (optional, only allowed once).</span> |
| <span class="sd"> ',' or 'G': Specifies the position of the grouping (thousands) separator (,).</span> |
| <span class="sd"> There must be a 0 or 9 to the left and right of each grouping separator.</span> |
| <span class="sd"> '$': Specifies the location of the $ currency sign. This character may only be specified once.</span> |
| <span class="sd"> 'S' or 'MI': Specifies the position of a '-' or '+' sign (optional, only allowed once at</span> |
| <span class="sd"> the beginning or end of the format string). Note that 'S' prints '+' for positive</span> |
| <span class="sd"> values but 'MI' prints a space.</span> |
| <span class="sd"> 'PR': Only allowed at the end of the format string; specifies that the result string</span> |
| <span class="sd"> will be wrapped by angle brackets if the input value is negative.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| <span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> format to use to convert char values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(78.12,)], ["e"])</span> |
| <span class="sd"> >>> df.select(to_char(df.e, lit("$99.99")).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='$78.12')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"to_char"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="to_varchar"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_varchar.html#pyspark.sql.functions.to_varchar">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">to_varchar</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Convert `col` to a string based on the `format`.</span> |
| <span class="sd"> Throws an exception if the conversion fails. The format can consist of the following</span> |
| <span class="sd"> characters, case insensitive:</span> |
| <span class="sd"> '0' or '9': Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the</span> |
| <span class="sd"> format string matches a sequence of digits in the input value, generating a result</span> |
| <span class="sd"> string of the same length as the corresponding sequence in the format string.</span> |
| <span class="sd"> The result string is left-padded with zeros if the 0/9 sequence comprises more digits</span> |
| <span class="sd"> than the matching part of the decimal value, starts with 0, and is before the decimal</span> |
| <span class="sd"> point. Otherwise, it is padded with spaces.</span> |
| <span class="sd"> '.' or 'D': Specifies the position of the decimal point (optional, only allowed once).</span> |
| <span class="sd"> ',' or 'G': Specifies the position of the grouping (thousands) separator (,).</span> |
| <span class="sd"> There must be a 0 or 9 to the left and right of each grouping separator.</span> |
| <span class="sd"> '$': Specifies the location of the $ currency sign. This character may only be specified once.</span> |
| <span class="sd"> 'S' or 'MI': Specifies the position of a '-' or '+' sign (optional, only allowed once at</span> |
| <span class="sd"> the beginning or end of the format string). Note that 'S' prints '+' for positive</span> |
| <span class="sd"> values but 'MI' prints a space.</span> |
| <span class="sd"> 'PR': Only allowed at the end of the format string; specifies that the result string</span> |
| <span class="sd"> will be wrapped by angle brackets if the input value is negative.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| <span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> format to use to convert char values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(78.12,)], ["e"])</span> |
| <span class="sd"> >>> df.select(to_varchar(df.e, lit("$99.99")).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='$78.12')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"to_varchar"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="to_number"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_number.html#pyspark.sql.functions.to_number">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">to_number</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Convert string 'col' to a number based on the string format 'format'.</span> |
| <span class="sd"> Throws an exception if the conversion fails. The format can consist of the following</span> |
| <span class="sd"> characters, case insensitive:</span> |
| <span class="sd"> '0' or '9': Specifies an expected digit between 0 and 9. A sequence of 0 or 9 in the</span> |
| <span class="sd"> format string matches a sequence of digits in the input string. If the 0/9</span> |
| <span class="sd"> sequence starts with 0 and is before the decimal point, it can only match a digit</span> |
| <span class="sd"> sequence of the same size. Otherwise, if the sequence starts with 9 or is after</span> |
| <span class="sd"> the decimal point, it can match a digit sequence that has the same or smaller size.</span> |
| <span class="sd"> '.' or 'D': Specifies the position of the decimal point (optional, only allowed once).</span> |
| <span class="sd"> ',' or 'G': Specifies the position of the grouping (thousands) separator (,).</span> |
| <span class="sd"> There must be a 0 or 9 to the left and right of each grouping separator.</span> |
| <span class="sd"> 'col' must match the grouping separator relevant for the size of the number.</span> |
| <span class="sd"> '$': Specifies the location of the $ currency sign. This character may only be</span> |
| <span class="sd"> specified once.</span> |
| <span class="sd"> 'S' or 'MI': Specifies the position of a '-' or '+' sign (optional, only allowed</span> |
| <span class="sd"> once at the beginning or end of the format string). Note that 'S' allows '-'</span> |
| <span class="sd"> but 'MI' does not.</span> |
| <span class="sd"> 'PR': Only allowed at the end of the format string; specifies that 'col' indicates a</span> |
| <span class="sd"> negative number with wrapping angled brackets.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| <span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> format to use to convert number values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("$78.12",)], ["e"])</span> |
| <span class="sd"> >>> df.select(to_number(df.e, lit("$99.99")).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=Decimal('78.12'))]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"to_number"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="replace"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.replace.html#pyspark.sql.functions.replace">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">replace</span><span class="p">(</span> |
| <span class="n">src</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">search</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">replace</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Replaces all occurrences of `search` with `replace`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> src : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string to be replaced.</span> |
| <span class="sd"> search : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string, If `search` is not found in `str`, `str` is returned unchanged.</span> |
| <span class="sd"> replace : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> A column of string, If `replace` is not specified or is an empty string,</span> |
| <span class="sd"> nothing replaces the string that is removed from `str`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("ABCabc", "abc", "DEF",)], ["a", "b", "c"])</span> |
| <span class="sd"> >>> df.select(replace(df.a, df.b, df.c).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='ABCDEF')]</span> |
| |
| <span class="sd"> >>> df.select(replace(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='ABC')]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">replace</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"replace"</span><span class="p">,</span> <span class="n">src</span><span class="p">,</span> <span class="n">search</span><span class="p">,</span> <span class="n">replace</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"replace"</span><span class="p">,</span> <span class="n">src</span><span class="p">,</span> <span class="n">search</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="split_part"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.split_part.html#pyspark.sql.functions.split_part">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">split_part</span><span class="p">(</span><span class="n">src</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">delimiter</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">partNum</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Splits `str` by delimiter and return requested part of the split (1-based).</span> |
| <span class="sd"> If any input is null, returns null. if `partNum` is out of range of split parts,</span> |
| <span class="sd"> returns empty string. If `partNum` is 0, throws an error. If `partNum` is negative,</span> |
| <span class="sd"> the parts are counted backward from the end of the string.</span> |
| <span class="sd"> If the `delimiter` is an empty string, the `str` is not split.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> src : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string to be splited.</span> |
| <span class="sd"> delimiter : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string, the delimiter used for split.</span> |
| <span class="sd"> partNum : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string, requested part of the split (1-based).</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("11.12.13", ".", 3,)], ["a", "b", "c"])</span> |
| <span class="sd"> >>> df.select(split_part(df.a, df.b, df.c).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='13')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"split_part"</span><span class="p">,</span> <span class="n">src</span><span class="p">,</span> <span class="n">delimiter</span><span class="p">,</span> <span class="n">partNum</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="substr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.substr.html#pyspark.sql.functions.substr">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">substr</span><span class="p">(</span> |
| <span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">pos</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">len</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the substring of `str` that starts at `pos` and is of length `len`,</span> |
| <span class="sd"> or the slice of byte array that starts at `pos` and is of length `len`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> src : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string.</span> |
| <span class="sd"> pos : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string, the substring of `str` that starts at `pos`.</span> |
| <span class="sd"> len : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> A column of string, the substring of `str` is of length `len`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [("Spark SQL", 5, 1,)], ["a", "b", "c"]</span> |
| <span class="sd"> ... ).select(sf.substr("a", "b", "c")).show()</span> |
| <span class="sd"> +---------------+</span> |
| <span class="sd"> |substr(a, b, c)|</span> |
| <span class="sd"> +---------------+</span> |
| <span class="sd"> | k|</span> |
| <span class="sd"> +---------------+</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [("Spark SQL", 5, 1,)], ["a", "b", "c"]</span> |
| <span class="sd"> ... ).select(sf.substr("a", "b")).show()</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> |substr(a, b, 2147483647)|</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> | k SQL|</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">len</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"substr"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pos</span><span class="p">,</span> <span class="nb">len</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"substr"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pos</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="parse_url"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.parse_url.html#pyspark.sql.functions.parse_url">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">parse_url</span><span class="p">(</span> |
| <span class="n">url</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">partToExtract</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extracts a part from a URL.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> url : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string.</span> |
| <span class="sd"> partToExtract : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string, the path.</span> |
| <span class="sd"> key : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> A column of string, the key.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [("http://spark.apache.org/path?query=1", "QUERY", "query",)],</span> |
| <span class="sd"> ... ["a", "b", "c"]</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(parse_url(df.a, df.b, df.c).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='1')]</span> |
| |
| <span class="sd"> >>> df.select(parse_url(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='query=1')]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">key</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"parse_url"</span><span class="p">,</span> <span class="n">url</span><span class="p">,</span> <span class="n">partToExtract</span><span class="p">,</span> <span class="n">key</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"parse_url"</span><span class="p">,</span> <span class="n">url</span><span class="p">,</span> <span class="n">partToExtract</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="printf"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.printf.html#pyspark.sql.functions.printf">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">printf</span><span class="p">(</span><span class="nb">format</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Formats the arguments in printf-style and returns the result as a string column.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> format : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> string that can contain embedded format tags and used as result column's value</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column names or :class:`~pyspark.sql.Column`\\s to be used in formatting</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [("aa%d%s", 123, "cc",)], ["a", "b", "c"]</span> |
| <span class="sd"> ... ).select(sf.printf("a", "b", "c")).show()</span> |
| <span class="sd"> +---------------+</span> |
| <span class="sd"> |printf(a, b, c)|</span> |
| <span class="sd"> +---------------+</span> |
| <span class="sd"> | aa123cc|</span> |
| <span class="sd"> +---------------+</span> |
| <span class="sd"> """</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"printf"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="nb">format</span><span class="p">),</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="url_decode"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.url_decode.html#pyspark.sql.functions.url_decode">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">url_decode</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Decodes a `str` in 'application/x-www-form-urlencoded' format</span> |
| <span class="sd"> using a specific encoding scheme.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string to decode.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("https%3A%2F%2Fspark.apache.org",)], ["a"])</span> |
| <span class="sd"> >>> df.select(url_decode(df.a).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='https://spark.apache.org')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"url_decode"</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="url_encode"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.url_encode.html#pyspark.sql.functions.url_encode">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">url_encode</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Translates a string into 'application/x-www-form-urlencoded' format</span> |
| <span class="sd"> using a specific encoding scheme.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string to encode.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("https://spark.apache.org",)], ["a"])</span> |
| <span class="sd"> >>> df.select(url_encode(df.a).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='https%3A%2F%2Fspark.apache.org')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"url_encode"</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="position"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.position.html#pyspark.sql.functions.position">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">position</span><span class="p">(</span> |
| <span class="n">substr</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the position of the first occurrence of `substr` in `str` after position `start`.</span> |
| <span class="sd"> The given `start` and return value are 1-based.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> substr : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string, substring.</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string.</span> |
| <span class="sd"> start : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> A column of string, start position.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [("bar", "foobarbar", 5,)], ["a", "b", "c"]</span> |
| <span class="sd"> ... ).select(sf.position("a", "b", "c")).show()</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> |position(a, b, c)|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> | 7|</span> |
| <span class="sd"> +-----------------+</span> |
| |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [("bar", "foobarbar", 5,)], ["a", "b", "c"]</span> |
| <span class="sd"> ... ).select(sf.position("a", "b")).show()</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> |position(a, b, 1)|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">start</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"position"</span><span class="p">,</span> <span class="n">substr</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">start</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"position"</span><span class="p">,</span> <span class="n">substr</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="endswith"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.endswith.html#pyspark.sql.functions.endswith">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">endswith</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">suffix</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a boolean. The value is True if str ends with suffix.</span> |
| <span class="sd"> Returns NULL if either input expression is NULL. Otherwise, returns False.</span> |
| <span class="sd"> Both str or suffix must be of STRING or BINARY type.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string.</span> |
| <span class="sd"> suffix : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string, the suffix.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Spark SQL", "Spark",)], ["a", "b"])</span> |
| <span class="sd"> >>> df.select(endswith(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=False)]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([("414243", "4243",)], ["e", "f"])</span> |
| <span class="sd"> >>> df = df.select(to_binary("e").alias("e"), to_binary("f").alias("f"))</span> |
| <span class="sd"> >>> df.printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- e: binary (nullable = true)</span> |
| <span class="sd"> |-- f: binary (nullable = true)</span> |
| <span class="sd"> >>> df.select(endswith("e", "f"), endswith("f", "e")).show()</span> |
| <span class="sd"> +--------------+--------------+</span> |
| <span class="sd"> |endswith(e, f)|endswith(f, e)|</span> |
| <span class="sd"> +--------------+--------------+</span> |
| <span class="sd"> | true| false|</span> |
| <span class="sd"> +--------------+--------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"endswith"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">suffix</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="startswith"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.startswith.html#pyspark.sql.functions.startswith">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">startswith</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">prefix</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a boolean. The value is True if str starts with prefix.</span> |
| <span class="sd"> Returns NULL if either input expression is NULL. Otherwise, returns False.</span> |
| <span class="sd"> Both str or prefix must be of STRING or BINARY type.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string.</span> |
| <span class="sd"> prefix : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A column of string, the prefix.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Spark SQL", "Spark",)], ["a", "b"])</span> |
| <span class="sd"> >>> df.select(startswith(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=True)]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([("414243", "4142",)], ["e", "f"])</span> |
| <span class="sd"> >>> df = df.select(to_binary("e").alias("e"), to_binary("f").alias("f"))</span> |
| <span class="sd"> >>> df.printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- e: binary (nullable = true)</span> |
| <span class="sd"> |-- f: binary (nullable = true)</span> |
| <span class="sd"> >>> df.select(startswith("e", "f"), startswith("f", "e")).show()</span> |
| <span class="sd"> +----------------+----------------+</span> |
| <span class="sd"> |startswith(e, f)|startswith(f, e)|</span> |
| <span class="sd"> +----------------+----------------+</span> |
| <span class="sd"> | true| false|</span> |
| <span class="sd"> +----------------+----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"startswith"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">prefix</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="char"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.char.html#pyspark.sql.functions.char">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">char</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the ASCII character having the binary equivalent to `col`. If col is larger than 256 the</span> |
| <span class="sd"> result is equivalent to char(col % 256)</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(1).select(sf.char(sf.lit(65))).show()</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> |char(65)|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> | A|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"char"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="btrim"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.btrim.html#pyspark.sql.functions.btrim">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">btrim</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">trim</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Remove the leading and trailing `trim` characters from `str`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| <span class="sd"> trim : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The trim string characters to trim, the default value is a single space</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("SSparkSQLS", "SL", )], ['a', 'b'])</span> |
| <span class="sd"> >>> df.select(btrim(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='parkSQ')]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(" SparkSQL ",)], ['a'])</span> |
| <span class="sd"> >>> df.select(btrim(df.a).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='SparkSQL')]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">trim</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"btrim"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">trim</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"btrim"</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="char_length"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.char_length.html#pyspark.sql.functions.char_length">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">char_length</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the character length of string data or number of bytes of binary data.</span> |
| <span class="sd"> The length of string data includes the trailing spaces.</span> |
| <span class="sd"> The length of binary data includes binary zeros.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(1).select(sf.char_length(sf.lit("SparkSQL"))).show()</span> |
| <span class="sd"> +---------------------+</span> |
| <span class="sd"> |char_length(SparkSQL)|</span> |
| <span class="sd"> +---------------------+</span> |
| <span class="sd"> | 8|</span> |
| <span class="sd"> +---------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"char_length"</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="character_length"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.character_length.html#pyspark.sql.functions.character_length">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">character_length</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the character length of string data or number of bytes of binary data.</span> |
| <span class="sd"> The length of string data includes the trailing spaces.</span> |
| <span class="sd"> The length of binary data includes binary zeros.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(1).select(sf.character_length(sf.lit("SparkSQL"))).show()</span> |
| <span class="sd"> +--------------------------+</span> |
| <span class="sd"> |character_length(SparkSQL)|</span> |
| <span class="sd"> +--------------------------+</span> |
| <span class="sd"> | 8|</span> |
| <span class="sd"> +--------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"character_length"</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="try_to_binary"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_to_binary.html#pyspark.sql.functions.try_to_binary">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">try_to_binary</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> This is a special version of `to_binary` that performs the same operation, but returns a NULL</span> |
| <span class="sd"> value instead of raising an error if the conversion cannot be performed.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| <span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> format to use to convert binary values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("abc",)], ["e"])</span> |
| <span class="sd"> >>> df.select(try_to_binary(df.e, lit("utf-8")).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=bytearray(b'abc'))]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([("414243",)], ["e"])</span> |
| <span class="sd"> >>> df.select(try_to_binary(df.e).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=bytearray(b'ABC'))]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">format</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"try_to_binary"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"try_to_binary"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="try_to_number"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_to_number.html#pyspark.sql.functions.try_to_number">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">try_to_number</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">format</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Convert string 'col' to a number based on the string format `format`. Returns NULL if the</span> |
| <span class="sd"> string 'col' does not match the expected format. The format follows the same semantics as the</span> |
| <span class="sd"> to_number function.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| <span class="sd"> format : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> format to use to convert number values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("$78.12",)], ["e"])</span> |
| <span class="sd"> >>> df.select(try_to_number(df.e, lit("$99.99")).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=Decimal('78.12'))]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"try_to_number"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="nb">format</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="contains"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.contains.html#pyspark.sql.functions.contains">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">contains</span><span class="p">(</span><span class="n">left</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">right</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a boolean. The value is True if right is found inside left.</span> |
| <span class="sd"> Returns NULL if either input expression is NULL. Otherwise, returns False.</span> |
| <span class="sd"> Both left or right must be of STRING or BINARY type.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> left : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The input column or strings to check, may be NULL.</span> |
| <span class="sd"> right : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The input column or strings to find, may be NULL.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Spark SQL", "Spark")], ['a', 'b'])</span> |
| <span class="sd"> >>> df.select(contains(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=True)]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([("414243", "4243",)], ["c", "d"])</span> |
| <span class="sd"> >>> df = df.select(to_binary("c").alias("c"), to_binary("d").alias("d"))</span> |
| <span class="sd"> >>> df.printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- c: binary (nullable = true)</span> |
| <span class="sd"> |-- d: binary (nullable = true)</span> |
| <span class="sd"> >>> df.select(contains("c", "d"), contains("d", "c")).show()</span> |
| <span class="sd"> +--------------+--------------+</span> |
| <span class="sd"> |contains(c, d)|contains(d, c)|</span> |
| <span class="sd"> +--------------+--------------+</span> |
| <span class="sd"> | true| false|</span> |
| <span class="sd"> +--------------+--------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"contains"</span><span class="p">,</span> <span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="elt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.elt.html#pyspark.sql.functions.elt">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">elt</span><span class="p">(</span><span class="o">*</span><span class="n">inputs</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the `n`-th input, e.g., returns `input2` when `n` is 2.</span> |
| <span class="sd"> The function returns NULL if the index exceeds the length of the array</span> |
| <span class="sd"> and `spark.sql.ansi.enabled` is set to false. If `spark.sql.ansi.enabled` is set to true,</span> |
| <span class="sd"> it throws ArrayIndexOutOfBoundsException for invalid indices.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> inputs : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input columns or strings.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, "scala", "java")], ['a', 'b', 'c'])</span> |
| <span class="sd"> >>> df.select(elt(df.a, df.b, df.c).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='scala')]</span> |
| <span class="sd"> """</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"elt"</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">inputs</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="find_in_set"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.find_in_set.html#pyspark.sql.functions.find_in_set">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">find_in_set</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">str_array</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the index (1-based) of the given string (`str`) in the comma-delimited</span> |
| <span class="sd"> list (`strArray`). Returns 0, if the string was not found or if the given string (`str`)</span> |
| <span class="sd"> contains a comma.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The given string to be found.</span> |
| <span class="sd"> str_array : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The comma-delimited list.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("ab", "abc,b,ab,c,def")], ['a', 'b'])</span> |
| <span class="sd"> >>> df.select(find_in_set(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=3)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"find_in_set"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">str_array</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="like"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.like.html#pyspark.sql.functions.like">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">like</span><span class="p">(</span> |
| <span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">pattern</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">escapeChar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"Column"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns true if str matches `pattern` with `escape`,</span> |
| <span class="sd"> null if any arguments are null, false otherwise.</span> |
| <span class="sd"> The default escape character is the '\'.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A string.</span> |
| <span class="sd"> pattern : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A string. The pattern is a string which is matched literally, with</span> |
| <span class="sd"> exception to the following special symbols:</span> |
| <span class="sd"> _ matches any one character in the input (similar to . in posix regular expressions)</span> |
| <span class="sd"> % matches zero or more characters in the input (similar to .* in posix regular</span> |
| <span class="sd"> expressions)</span> |
| <span class="sd"> Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order</span> |
| <span class="sd"> to match "\abc", the pattern should be "\\abc".</span> |
| <span class="sd"> When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back</span> |
| <span class="sd"> to Spark 1.6 behavior regarding string literal parsing. For example, if the config is</span> |
| <span class="sd"> enabled, the pattern to match "\abc" should be "\abc".</span> |
| <span class="sd"> escape : :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> An character added since Spark 3.0. The default escape character is the '\'.</span> |
| <span class="sd"> If an escape character precedes a special symbol or another escape character, the</span> |
| <span class="sd"> following character is matched literally. It is invalid to escape any other character.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Spark", "_park")], ['a', 'b'])</span> |
| <span class="sd"> >>> df.select(like(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=True)]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [("%SystemDrive%/Users/John", "/%SystemDrive/%//Users%")],</span> |
| <span class="sd"> ... ['a', 'b']</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(like(df.a, df.b, lit('/')).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=True)]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">escapeChar</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"like"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pattern</span><span class="p">,</span> <span class="n">escapeChar</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"like"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pattern</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="ilike"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ilike.html#pyspark.sql.functions.ilike">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">ilike</span><span class="p">(</span> |
| <span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">pattern</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">escapeChar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"Column"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns true if str matches `pattern` with `escape` case-insensitively,</span> |
| <span class="sd"> null if any arguments are null, false otherwise.</span> |
| <span class="sd"> The default escape character is the '\'.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A string.</span> |
| <span class="sd"> pattern : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> A string. The pattern is a string which is matched literally, with</span> |
| <span class="sd"> exception to the following special symbols:</span> |
| <span class="sd"> _ matches any one character in the input (similar to . in posix regular expressions)</span> |
| <span class="sd"> % matches zero or more characters in the input (similar to .* in posix regular</span> |
| <span class="sd"> expressions)</span> |
| <span class="sd"> Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order</span> |
| <span class="sd"> to match "\abc", the pattern should be "\\abc".</span> |
| <span class="sd"> When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back</span> |
| <span class="sd"> to Spark 1.6 behavior regarding string literal parsing. For example, if the config is</span> |
| <span class="sd"> enabled, the pattern to match "\abc" should be "\abc".</span> |
| <span class="sd"> escape : :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> An character added since Spark 3.0. The default escape character is the '\'.</span> |
| <span class="sd"> If an escape character precedes a special symbol or another escape character, the</span> |
| <span class="sd"> following character is matched literally. It is invalid to escape any other character.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Spark", "_park")], ['a', 'b'])</span> |
| <span class="sd"> >>> df.select(ilike(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=True)]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [("%SystemDrive%/Users/John", "/%SystemDrive/%//Users%")],</span> |
| <span class="sd"> ... ['a', 'b']</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(ilike(df.a, df.b, lit('/')).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=True)]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">escapeChar</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"ilike"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pattern</span><span class="p">,</span> <span class="n">escapeChar</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"ilike"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pattern</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="lcase"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.lcase.html#pyspark.sql.functions.lcase">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">lcase</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns `str` with all characters changed to lowercase.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(1).select(sf.lcase(sf.lit("Spark"))).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |lcase(Spark)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | spark|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"lcase"</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="ucase"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ucase.html#pyspark.sql.functions.ucase">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">ucase</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns `str` with all characters changed to uppercase.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(1).select(sf.ucase(sf.lit("Spark"))).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |ucase(Spark)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | SPARK|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"ucase"</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="left"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.left.html#pyspark.sql.functions.left">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">left</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">len</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the leftmost `len`(`len` can be string type) characters from the string `str`,</span> |
| <span class="sd"> if `len` is less or equal than 0 the result is an empty string.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| <span class="sd"> len : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings, the leftmost `len`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Spark SQL", 3,)], ['a', 'b'])</span> |
| <span class="sd"> >>> df.select(left(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='Spa')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"left"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">len</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="right"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.right.html#pyspark.sql.functions.right">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">right</span><span class="p">(</span><span class="nb">str</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">len</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the rightmost `len`(`len` can be string type) characters from the string `str`,</span> |
| <span class="sd"> if `len` is less or equal than 0 the result is an empty string.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> str : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| <span class="sd"> len : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings, the rightmost `len`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Spark SQL", 3,)], ['a', 'b'])</span> |
| <span class="sd"> >>> df.select(right(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='SQL')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"right"</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="nb">len</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="mask"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.mask.html#pyspark.sql.functions.mask">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">mask</span><span class="p">(</span> |
| <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">upperChar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">lowerChar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">digitChar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">otherChar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Masks the given string value. This can be useful for creating copies of tables with sensitive</span> |
| <span class="sd"> information removed.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col: :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| <span class="sd"> upperChar: :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> character to replace upper-case characters with. Specify NULL to retain original character.</span> |
| <span class="sd"> lowerChar: :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> character to replace lower-case characters with. Specify NULL to retain original character.</span> |
| <span class="sd"> digitChar: :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> character to replace digit characters with. Specify NULL to retain original character.</span> |
| <span class="sd"> otherChar: :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> character to replace all other characters with. Specify NULL to retain original character.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("AbCD123-@$#",), ("abcd-EFGH-8765-4321",)], ['data'])</span> |
| <span class="sd"> >>> df.select(mask(df.data).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='XxXXnnn-@$#'), Row(r='xxxx-XXXX-nnnn-nnnn')]</span> |
| <span class="sd"> >>> df.select(mask(df.data, lit('Y')).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='YxYYnnn-@$#'), Row(r='xxxx-YYYY-nnnn-nnnn')]</span> |
| <span class="sd"> >>> df.select(mask(df.data, lit('Y'), lit('y')).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='YyYYnnn-@$#'), Row(r='yyyy-YYYY-nnnn-nnnn')]</span> |
| <span class="sd"> >>> df.select(mask(df.data, lit('Y'), lit('y'), lit('d')).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='YyYYddd-@$#'), Row(r='yyyy-YYYY-dddd-dddd')]</span> |
| <span class="sd"> >>> df.select(mask(df.data, lit('Y'), lit('y'), lit('d'), lit('*')).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='YyYYddd****'), Row(r='yyyy*YYYY*dddd*dddd')]</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_upperChar</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">"X"</span><span class="p">)</span> <span class="k">if</span> <span class="n">upperChar</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">upperChar</span> |
| <span class="n">_lowerChar</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">"x"</span><span class="p">)</span> <span class="k">if</span> <span class="n">lowerChar</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">lowerChar</span> |
| <span class="n">_digitChar</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">"n"</span><span class="p">)</span> <span class="k">if</span> <span class="n">digitChar</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">digitChar</span> |
| <span class="n">_otherChar</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span> <span class="k">if</span> <span class="n">otherChar</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">otherChar</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span> |
| <span class="s2">"mask"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">_upperChar</span><span class="p">,</span> <span class="n">_lowerChar</span><span class="p">,</span> <span class="n">_digitChar</span><span class="p">,</span> <span class="n">_otherChar</span> |
| <span class="p">)</span></div> |
| |
| |
| <span class="c1"># ---------------------- Collection functions ------------------------------</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">create_map</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">create_map</span><span class="p">(</span><span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">,</span> <span class="o">...</span><span class="p">]])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <div class="viewcode-block" id="create_map"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.create_map.html#pyspark.sql.functions.create_map">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">create_map</span><span class="p">(</span> |
| <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Creates a new map column.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column names or :class:`~pyspark.sql.Column`\\s that are</span> |
| <span class="sd"> grouped as key-value pairs, e.g. (key1, value1, key2, value2, ...).</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age"))</span> |
| <span class="sd"> >>> df.select(create_map('name', 'age').alias("map")).collect()</span> |
| <span class="sd"> [Row(map={'Alice': 2}), Row(map={'Bob': 5})]</span> |
| <span class="sd"> >>> df.select(create_map([df.name, df.age]).alias("map")).collect()</span> |
| <span class="sd"> [Row(map={'Alice': 2}), Row(map={'Bob': 5})]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span> |
| <span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"map"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span> <span class="c1"># type: ignore[arg-type]</span></div> |
| |
| |
| <div class="viewcode-block" id="map_from_arrays"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_from_arrays.html#pyspark.sql.functions.map_from_arrays">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">map_from_arrays</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Creates a new map from two arrays.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing a set of keys. All elements should not be null</span> |
| <span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing a set of values</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a column of map type.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([2, 5], ['a', 'b'])], ['k', 'v'])</span> |
| <span class="sd"> >>> df = df.select(map_from_arrays(df.k, df.v).alias("col"))</span> |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> | col|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> |{2 -> a, 5 -> b}|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> >>> df.printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- col: map (nullable = true)</span> |
| <span class="sd"> | |-- key: long</span> |
| <span class="sd"> | |-- value: string (valueContainsNull = true)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"map_from_arrays"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">array</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">array</span><span class="p">(</span><span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">,</span> <span class="o">...</span><span class="p">]])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <div class="viewcode-block" id="array"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array.html#pyspark.sql.functions.array">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array</span><span class="p">(</span> |
| <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Creates a new array column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column names or :class:`~pyspark.sql.Column`\\s that have</span> |
| <span class="sd"> the same data type.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a column of array type.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], ("name", "age"))</span> |
| <span class="sd"> >>> df.select(array('age', 'age').alias("arr")).collect()</span> |
| <span class="sd"> [Row(arr=[2, 2]), Row(arr=[5, 5])]</span> |
| <span class="sd"> >>> df.select(array([df.age, df.age]).alias("arr")).collect()</span> |
| <span class="sd"> [Row(arr=[2, 2]), Row(arr=[5, 5])]</span> |
| <span class="sd"> >>> df.select(array('age', 'age').alias("col")).printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- col: array (nullable = false)</span> |
| <span class="sd"> | |-- element: long (containsNull = true)</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span> |
| <span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"array"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span> <span class="c1"># type: ignore[arg-type]</span></div> |
| |
| |
| <div class="viewcode-block" id="array_contains"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_contains.html#pyspark.sql.functions.array_contains">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_contains</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: returns null if the array is null, true if the array contains the</span> |
| <span class="sd"> given value, and false otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing array</span> |
| <span class="sd"> value :</span> |
| <span class="sd"> value or column to check for in array</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a column of Boolean type.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(["a", "b", "c"],), ([],)], ['data'])</span> |
| <span class="sd"> >>> df.select(array_contains(df.data, "a")).collect()</span> |
| <span class="sd"> [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)]</span> |
| <span class="sd"> >>> df.select(array_contains(df.data, lit("a"))).collect()</span> |
| <span class="sd"> [Row(array_contains(data, a)=True), Row(array_contains(data, a)=False)]</span> |
| <span class="sd"> """</span> |
| <span class="n">value</span> <span class="o">=</span> <span class="n">value</span><span class="o">.</span><span class="n">_jc</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">else</span> <span class="n">value</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"array_contains"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">value</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="arrays_overlap"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.arrays_overlap.html#pyspark.sql.functions.arrays_overlap">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">arrays_overlap</span><span class="p">(</span><span class="n">a1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">a2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: returns true if the arrays contain any common non-null element; if not,</span> |
| <span class="sd"> returns null if both the arrays are non-empty and any of them contains a null element; returns</span> |
| <span class="sd"> false otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a column of Boolean type.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(["a", "b"], ["b", "c"]), (["a"], ["b", "c"])], ['x', 'y'])</span> |
| <span class="sd"> >>> df.select(arrays_overlap(df.x, df.y).alias("overlap")).collect()</span> |
| <span class="sd"> [Row(overlap=True), Row(overlap=False)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"arrays_overlap"</span><span class="p">,</span> <span class="n">a1</span><span class="p">,</span> <span class="n">a2</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="slice"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.slice.html#pyspark.sql.functions.slice">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">slice</span><span class="p">(</span> |
| <span class="n">x</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">start</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">int</span><span class="p">],</span> <span class="n">length</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: returns an array containing all the elements in `x` from index `start`</span> |
| <span class="sd"> (array indices start at 1, or from the end if `start` is negative) with the specified `length`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> x : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column name or column containing the array to be sliced</span> |
| <span class="sd"> start : :class:`~pyspark.sql.Column` or str or int</span> |
| <span class="sd"> column name, column, or int containing the starting index</span> |
| <span class="sd"> length : :class:`~pyspark.sql.Column` or str or int</span> |
| <span class="sd"> column name, column, or int containing the length of the slice</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a column of array type. Subset of array.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([1, 2, 3],), ([4, 5],)], ['x'])</span> |
| <span class="sd"> >>> df.select(slice(df.x, 2, 2).alias("sliced")).collect()</span> |
| <span class="sd"> [Row(sliced=[2, 3]), Row(sliced=[5])]</span> |
| <span class="sd"> """</span> |
| <span class="n">start</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">start</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">start</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">start</span> |
| <span class="n">length</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">length</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">length</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">length</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"slice"</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">length</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_join"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_join.html#pyspark.sql.functions.array_join">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_join</span><span class="p">(</span> |
| <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">delimiter</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">null_replacement</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Concatenates the elements of `column` using the `delimiter`. Null values are replaced with</span> |
| <span class="sd"> `null_replacement` if set, otherwise they are ignored.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> delimiter : str</span> |
| <span class="sd"> delimiter used to concatenate elements</span> |
| <span class="sd"> null_replacement : str, optional</span> |
| <span class="sd"> if set then null values will be replaced by this value</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a column of string type. Concatenated values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(["a", "b", "c"],), (["a", None],)], ['data'])</span> |
| <span class="sd"> >>> df.select(array_join(df.data, ",").alias("joined")).collect()</span> |
| <span class="sd"> [Row(joined='a,b,c'), Row(joined='a')]</span> |
| <span class="sd"> >>> df.select(array_join(df.data, ",", "NULL").alias("joined")).collect()</span> |
| <span class="sd"> [Row(joined='a,b,c'), Row(joined='a,NULL')]</span> |
| <span class="sd"> """</span> |
| <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">null_replacement</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"array_join"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">delimiter</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"array_join"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">delimiter</span><span class="p">,</span> <span class="n">null_replacement</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="concat"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.concat.html#pyspark.sql.functions.concat">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">concat</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Concatenates multiple input columns together into a single column.</span> |
| <span class="sd"> The function works with strings, numeric, binary and compatible array columns.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column or columns to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> concatenated values. Type of the `Column` depends on input columns' type.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> :meth:`pyspark.sql.functions.array_join` : to concatenate string columns with delimiter</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('abcd','123')], ['s', 'd'])</span> |
| <span class="sd"> >>> df = df.select(concat(df.s, df.d).alias('s'))</span> |
| <span class="sd"> >>> df.collect()</span> |
| <span class="sd"> [Row(s='abcd123')]</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> DataFrame[s: string]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([([1, 2], [3, 4], [5]), ([1, 2], None, [3])], ['a', 'b', 'c'])</span> |
| <span class="sd"> >>> df = df.select(concat(df.a, df.b, df.c).alias("arr"))</span> |
| <span class="sd"> >>> df.collect()</span> |
| <span class="sd"> [Row(arr=[1, 2, 3, 4, 5]), Row(arr=None)]</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> DataFrame[arr: array<bigint>]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"concat"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_position"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_position.html#pyspark.sql.functions.array_position">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_position</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: Locates the position of the first occurrence of the given value</span> |
| <span class="sd"> in the given array. Returns null if either of the arguments are null.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The position is not zero based, but 1 based index. Returns 0 if the given</span> |
| <span class="sd"> value could not be found in the array.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| <span class="sd"> value : Any</span> |
| <span class="sd"> value to look for.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> position of the value in the given array if found and 0 otherwise.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(["c", "b", "a"],), ([],)], ['data'])</span> |
| <span class="sd"> >>> df.select(array_position(df.data, "a")).collect()</span> |
| <span class="sd"> [Row(array_position(data, a)=3), Row(array_position(data, a)=0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"array_position"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">value</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="element_at"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.element_at.html#pyspark.sql.functions.element_at">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">element_at</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">extraction</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: Returns element of array at given index in `extraction` if col is array.</span> |
| <span class="sd"> Returns value for the given key in `extraction` if col is map. If position is negative</span> |
| <span class="sd"> then location of the element will start from end, if number is outside the</span> |
| <span class="sd"> array boundaries then None will be returned.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing array or map</span> |
| <span class="sd"> extraction :</span> |
| <span class="sd"> index to check for in array or key to check for in map</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> value at given position.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The position is not zero based, but 1 based index.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> :meth:`get`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data'])</span> |
| <span class="sd"> >>> df.select(element_at(df.data, 1)).collect()</span> |
| <span class="sd"> [Row(element_at(data, 1)='a')]</span> |
| <span class="sd"> >>> df.select(element_at(df.data, -1)).collect()</span> |
| <span class="sd"> [Row(element_at(data, -1)='c')]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data'])</span> |
| <span class="sd"> >>> df.select(element_at(df.data, lit("a"))).collect()</span> |
| <span class="sd"> [Row(element_at(data, a)=1.0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"element_at"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">lit</span><span class="p">(</span><span class="n">extraction</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="try_element_at"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_element_at.html#pyspark.sql.functions.try_element_at">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">try_element_at</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">extraction</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> (array, index) - Returns element of array at given (1-based) index. If Index is 0, Spark will</span> |
| <span class="sd"> throw an error. If index < 0, accesses elements from the last to the first. The function</span> |
| <span class="sd"> always returns NULL if the index exceeds the length of the array.</span> |
| |
| <span class="sd"> (map, key) - Returns value for given key. The function always returns NULL if the key is not</span> |
| <span class="sd"> contained in the map.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing array or map</span> |
| <span class="sd"> extraction :</span> |
| <span class="sd"> index to check for in array or key to check for in map</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(["a", "b", "c"],)], ['data'])</span> |
| <span class="sd"> >>> df.select(try_element_at(df.data, lit(1)).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='a')]</span> |
| <span class="sd"> >>> df.select(try_element_at(df.data, lit(-1)).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='c')]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([({"a": 1.0, "b": 2.0},)], ['data'])</span> |
| <span class="sd"> >>> df.select(try_element_at(df.data, lit("a")).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=1.0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"try_element_at"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">extraction</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="get"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.get.html#pyspark.sql.functions.get">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">get</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">index</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">int</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: Returns element of array at given (0-based) index.</span> |
| <span class="sd"> If the index points outside of the array boundaries, then this function</span> |
| <span class="sd"> returns NULL.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing array</span> |
| <span class="sd"> index : :class:`~pyspark.sql.Column` or str or int</span> |
| <span class="sd"> index to check for in array</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> value at given position.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The position is not 1 based, but 0 based index.</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> :meth:`element_at`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(["a", "b", "c"], 1)], ['data', 'index'])</span> |
| <span class="sd"> >>> df.select(get(df.data, 1)).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |get(data, 1)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | b|</span> |
| <span class="sd"> +------------+</span> |
| |
| <span class="sd"> >>> df.select(get(df.data, -1)).show()</span> |
| <span class="sd"> +-------------+</span> |
| <span class="sd"> |get(data, -1)|</span> |
| <span class="sd"> +-------------+</span> |
| <span class="sd"> | NULL|</span> |
| <span class="sd"> +-------------+</span> |
| |
| <span class="sd"> >>> df.select(get(df.data, 3)).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |get(data, 3)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | NULL|</span> |
| <span class="sd"> +------------+</span> |
| |
| <span class="sd"> >>> df.select(get(df.data, "index")).show()</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> |get(data, index)|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> | b|</span> |
| <span class="sd"> +----------------+</span> |
| |
| <span class="sd"> >>> df.select(get(df.data, col("index") - 1)).show()</span> |
| <span class="sd"> +----------------------+</span> |
| <span class="sd"> |get(data, (index - 1))|</span> |
| <span class="sd"> +----------------------+</span> |
| <span class="sd"> | a|</span> |
| <span class="sd"> +----------------------+</span> |
| <span class="sd"> """</span> |
| <span class="n">index</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">index</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">index</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"get"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">index</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_prepend"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_prepend.html#pyspark.sql.functions.array_prepend">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_prepend</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: Returns an array containing element as</span> |
| <span class="sd"> well as all elements from array. The new element is positioned</span> |
| <span class="sd"> at the beginning of the array.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing array</span> |
| <span class="sd"> value :</span> |
| <span class="sd"> a literal value, or a :class:`~pyspark.sql.Column` expression.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> an array excluding given value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([2, 3, 4],), ([],)], ['data'])</span> |
| <span class="sd"> >>> df.select(array_prepend(df.data, 1)).collect()</span> |
| <span class="sd"> [Row(array_prepend(data, 1)=[1, 2, 3, 4]), Row(array_prepend(data, 1)=[1])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"array_prepend"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">lit</span><span class="p">(</span><span class="n">value</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="array_remove"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_remove.html#pyspark.sql.functions.array_remove">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_remove</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">element</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: Remove all elements that equal to element from the given array.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing array</span> |
| <span class="sd"> element :</span> |
| <span class="sd"> element to be removed from the array</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> an array excluding given value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([1, 2, 3, 1, 1],), ([],)], ['data'])</span> |
| <span class="sd"> >>> df.select(array_remove(df.data, 1)).collect()</span> |
| <span class="sd"> [Row(array_remove(data, 1)=[2, 3]), Row(array_remove(data, 1)=[])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"array_remove"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">element</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_distinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_distinct.html#pyspark.sql.functions.array_distinct">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_distinct</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: removes duplicate values from the array.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> an array of unique values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([1, 2, 3, 2],), ([4, 5, 5, 4],)], ['data'])</span> |
| <span class="sd"> >>> df.select(array_distinct(df.data)).collect()</span> |
| <span class="sd"> [Row(array_distinct(data)=[1, 2, 3]), Row(array_distinct(data)=[4, 5])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"array_distinct"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_insert"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_insert.html#pyspark.sql.functions.array_insert">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_insert</span><span class="p">(</span><span class="n">arr</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">pos</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">int</span><span class="p">],</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: adds an item into a given array at a specified array index.</span> |
| <span class="sd"> Array indices start at 1, or start from the end if index is negative.</span> |
| <span class="sd"> Index above array size appends the array, or prepends the array if index is negative,</span> |
| <span class="sd"> with 'null' elements.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> arr : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing an array</span> |
| <span class="sd"> pos : :class:`~pyspark.sql.Column` or str or int</span> |
| <span class="sd"> name of Numeric type column indicating position of insertion</span> |
| <span class="sd"> (starting at index 1, negative position is a start from the back of the array)</span> |
| <span class="sd"> value :</span> |
| <span class="sd"> a literal value, or a :class:`~pyspark.sql.Column` expression.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> an array of values, including the new specified value</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(['a', 'b', 'c'], 2, 'd'), (['c', 'b', 'a'], -2, 'd')],</span> |
| <span class="sd"> ... ['data', 'pos', 'val']</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(array_insert(df.data, df.pos.cast('integer'), df.val).alias('data')).collect()</span> |
| <span class="sd"> [Row(data=['a', 'd', 'b', 'c']), Row(data=['c', 'b', 'd', 'a'])]</span> |
| <span class="sd"> >>> df.select(array_insert(df.data, 5, 'hello').alias('data')).collect()</span> |
| <span class="sd"> [Row(data=['a', 'b', 'c', None, 'hello']), Row(data=['c', 'b', 'a', None, 'hello'])]</span> |
| <span class="sd"> """</span> |
| <span class="n">pos</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">pos</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pos</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">pos</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"array_insert"</span><span class="p">,</span> <span class="n">arr</span><span class="p">,</span> <span class="n">pos</span><span class="p">,</span> <span class="n">lit</span><span class="p">(</span><span class="n">value</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="array_intersect"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_intersect.html#pyspark.sql.functions.array_intersect">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_intersect</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: returns an array of the elements in the intersection of col1 and col2,</span> |
| <span class="sd"> without duplicates.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing array</span> |
| <span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing array</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> an array of values in the intersection of two arrays.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])])</span> |
| <span class="sd"> >>> df.select(array_intersect(df.c1, df.c2)).collect()</span> |
| <span class="sd"> [Row(array_intersect(c1, c2)=['a', 'c'])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"array_intersect"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_union"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_union.html#pyspark.sql.functions.array_union">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_union</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: returns an array of the elements in the union of col1 and col2,</span> |
| <span class="sd"> without duplicates.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing array</span> |
| <span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing array</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> an array of values in union of two arrays.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])])</span> |
| <span class="sd"> >>> df.select(array_union(df.c1, df.c2)).collect()</span> |
| <span class="sd"> [Row(array_union(c1, c2)=['b', 'a', 'c', 'd', 'f'])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"array_union"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_except"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_except.html#pyspark.sql.functions.array_except">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_except</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: returns an array of the elements in col1 but not in col2,</span> |
| <span class="sd"> without duplicates.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing array</span> |
| <span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing array</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> an array of values from first array that are not in the second.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2=["c", "d", "a", "f"])])</span> |
| <span class="sd"> >>> df.select(array_except(df.c1, df.c2)).collect()</span> |
| <span class="sd"> [Row(array_except(c1, c2)=['b'])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"array_except"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_compact"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_compact.html#pyspark.sql.functions.array_compact">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_compact</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: removes null values from the array.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> an array by excluding the null values.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([1, None, 2, 3],), ([4, 5, None, 4],)], ['data'])</span> |
| <span class="sd"> >>> df.select(array_compact(df.data)).collect()</span> |
| <span class="sd"> [Row(array_compact(data)=[1, 2, 3]), Row(array_compact(data)=[4, 5, 4])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"array_compact"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_append"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_append.html#pyspark.sql.functions.array_append">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_append</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: returns an array of the elements in col1 along</span> |
| <span class="sd"> with the added element in col2 at the last of the array.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing array</span> |
| <span class="sd"> value :</span> |
| <span class="sd"> a literal value, or a :class:`~pyspark.sql.Column` expression.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> an array of values from first array along with the element.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> df = spark.createDataFrame([Row(c1=["b", "a", "c"], c2="c")])</span> |
| <span class="sd"> >>> df.select(array_append(df.c1, df.c2)).collect()</span> |
| <span class="sd"> [Row(array_append(c1, c2)=['b', 'a', 'c', 'c'])]</span> |
| <span class="sd"> >>> df.select(array_append(df.c1, 'x')).collect()</span> |
| <span class="sd"> [Row(array_append(c1, x)=['b', 'a', 'c', 'x'])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"array_append"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">lit</span><span class="p">(</span><span class="n">value</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="explode"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.explode.html#pyspark.sql.functions.explode">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">explode</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a new row for each element in the given array or map.</span> |
| <span class="sd"> Uses the default column name `col` for elements in the array and</span> |
| <span class="sd"> `key` and `value` for elements in the map unless specified otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> one row per array item or map key value.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> :meth:`pyspark.functions.posexplode`</span> |
| <span class="sd"> :meth:`pyspark.functions.explode_outer`</span> |
| <span class="sd"> :meth:`pyspark.functions.posexplode_outer`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> df = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])</span> |
| <span class="sd"> >>> df.select(explode(df.intlist).alias("anInt")).collect()</span> |
| <span class="sd"> [Row(anInt=1), Row(anInt=2), Row(anInt=3)]</span> |
| |
| <span class="sd"> >>> df.select(explode(df.mapfield).alias("key", "value")).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |key|value|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | a| b|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"explode"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
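| # --- Illustrative sketch, not part of the original source: when explode() is used |
| # without an alias it falls back to the default output column names documented |
| # above -- `col` for array elements and `key`/`value` for map entries. Assumes an |
| # active SparkSession bound to `spark`, as in the doctests. |
| from pyspark.sql import Row |
| from pyspark.sql.functions import explode |
| _df = spark.createDataFrame([Row(intlist=[1, 2, 3], mapfield={"a": "b"})]) |
| _df.select(explode(_df.intlist)).columns # ['col'] |
| _df.select(explode(_df.mapfield)).columns # ['key', 'value'] |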
| |
| <div class="viewcode-block" id="posexplode"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.posexplode.html#pyspark.sql.functions.posexplode">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">posexplode</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a new row for each element with position in the given array or map.</span> |
| <span class="sd"> Uses the default column name `pos` for position, and `col` for elements in the</span> |
| <span class="sd"> array and `key` and `value` for elements in the map unless specified otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> one row per array item or map key value including positions as a separate column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> df = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])</span> |
| <span class="sd"> >>> df.select(posexplode(df.intlist)).collect()</span> |
| <span class="sd"> [Row(pos=0, col=1), Row(pos=1, col=2), Row(pos=2, col=3)]</span> |
| |
| <span class="sd"> >>> df.select(posexplode(df.mapfield)).show()</span> |
| <span class="sd"> +---+---+-----+</span> |
| <span class="sd"> |pos|key|value|</span> |
| <span class="sd"> +---+---+-----+</span> |
| <span class="sd"> | 0| a| b|</span> |
| <span class="sd"> +---+---+-----+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"posexplode"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="inline"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.inline.html#pyspark.sql.functions.inline">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">inline</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Explodes an array of structs into a table.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> input column of values to explode.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> generator expression with the inline exploded result.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> :meth:`explode`</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> df = spark.createDataFrame([Row(structlist=[Row(a=1, b=2), Row(a=3, b=4)])])</span> |
| <span class="sd"> >>> df.select(inline(df.structlist)).show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | a| b|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 2|</span> |
| <span class="sd"> | 3| 4|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"inline"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="explode_outer"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.explode_outer.html#pyspark.sql.functions.explode_outer">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">explode_outer</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a new row for each element in the given array or map.</span> |
| <span class="sd"> Unlike explode, if the array/map is null or empty then null is produced.</span> |
| <span class="sd"> Uses the default column name `col` for elements in the array and</span> |
| <span class="sd"> `key` and `value` for elements in the map unless specified otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> one row per array item or map key value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)],</span> |
| <span class="sd"> ... ("id", "an_array", "a_map")</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select("id", "an_array", explode_outer("a_map")).show()</span> |
| <span class="sd"> +---+----------+----+-----+</span> |
| <span class="sd"> | id| an_array| key|value|</span> |
| <span class="sd"> +---+----------+----+-----+</span> |
| <span class="sd"> | 1|[foo, bar]| x| 1.0|</span> |
| <span class="sd"> | 2| []|NULL| NULL|</span> |
| <span class="sd"> | 3| NULL|NULL| NULL|</span> |
| <span class="sd"> +---+----------+----+-----+</span> |
| |
| <span class="sd"> >>> df.select("id", "a_map", explode_outer("an_array")).show()</span> |
| <span class="sd"> +---+----------+----+</span> |
| <span class="sd"> | id| a_map| col|</span> |
| <span class="sd"> +---+----------+----+</span> |
| <span class="sd"> | 1|{x -> 1.0}| foo|</span> |
| <span class="sd"> | 1|{x -> 1.0}| bar|</span> |
| <span class="sd"> | 2| {}|NULL|</span> |
| <span class="sd"> | 3| NULL|NULL|</span> |
| <span class="sd"> +---+----------+----+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"explode_outer"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="posexplode_outer"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.posexplode_outer.html#pyspark.sql.functions.posexplode_outer">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">posexplode_outer</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a new row for each element with position in the given array or map.</span> |
| <span class="sd"> Unlike posexplode, if the array/map is null or empty then the row (null, null) is produced.</span> |
| <span class="sd"> Uses the default column name `pos` for position, and `col` for elements in the</span> |
| <span class="sd"> array and `key` and `value` for elements in the map unless specified otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> one row per array item or map key value including positions as a separate column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)],</span> |
| <span class="sd"> ... ("id", "an_array", "a_map")</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select("id", "an_array", posexplode_outer("a_map")).show()</span> |
| <span class="sd"> +---+----------+----+----+-----+</span> |
| <span class="sd"> | id| an_array| pos| key|value|</span> |
| <span class="sd"> +---+----------+----+----+-----+</span> |
| <span class="sd"> | 1|[foo, bar]| 0| x| 1.0|</span> |
| <span class="sd"> | 2| []|NULL|NULL| NULL|</span> |
| <span class="sd"> | 3| NULL|NULL|NULL| NULL|</span> |
| <span class="sd"> +---+----------+----+----+-----+</span> |
| <span class="sd"> >>> df.select("id", "a_map", posexplode_outer("an_array")).show()</span> |
| <span class="sd"> +---+----------+----+----+</span> |
| <span class="sd"> | id| a_map| pos| col|</span> |
| <span class="sd"> +---+----------+----+----+</span> |
| <span class="sd"> | 1|{x -> 1.0}| 0| foo|</span> |
| <span class="sd"> | 1|{x -> 1.0}| 1| bar|</span> |
| <span class="sd"> | 2| {}|NULL|NULL|</span> |
| <span class="sd"> | 3| NULL|NULL|NULL|</span> |
| <span class="sd"> +---+----------+----+----+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"posexplode_outer"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="inline_outer"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.inline_outer.html#pyspark.sql.functions.inline_outer">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">inline_outer</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Explodes an array of structs into a table.</span> |
| <span class="sd"> Unlike inline, if the array is null or empty then null is produced for each nested column.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> input column of values to explode.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> generator expression with the inline exploded result.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> :meth:`explode_outer`</span> |
| <span class="sd"> :meth:`inline`</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... Row(id=1, structlist=[Row(a=1, b=2), Row(a=3, b=4)]),</span> |
| <span class="sd"> ... Row(id=2, structlist=[])</span> |
| <span class="sd"> ... ])</span> |
| <span class="sd"> >>> df.select('id', inline_outer(df.structlist)).show()</span> |
| <span class="sd"> +---+----+----+</span> |
| <span class="sd"> | id| a| b|</span> |
| <span class="sd"> +---+----+----+</span> |
| <span class="sd"> | 1| 1| 2|</span> |
| <span class="sd"> | 1| 3| 4|</span> |
| <span class="sd"> | 2|NULL|NULL|</span> |
| <span class="sd"> +---+----+----+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"inline_outer"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="get_json_object"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.get_json_object.html#pyspark.sql.functions.get_json_object">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">get_json_object</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Extracts json object from a json string based on json `path` specified, and returns json string</span> |
| <span class="sd"> of the extracted json object. It will return null if the input json string is invalid.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> string column in json format</span> |
| <span class="sd"> path : str</span> |
| <span class="sd"> path to the json object to extract</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> string representation of given JSON object value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ("key", "jstring"))</span> |
| <span class="sd"> >>> df.select(df.key, get_json_object(df.jstring, '$.f1').alias("c0"), \\</span> |
| <span class="sd"> ... get_json_object(df.jstring, '$.f2').alias("c1") ).collect()</span> |
| <span class="sd"> [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"get_json_object"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">path</span><span class="p">)</span></div> |
| |
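| # --- Illustrative sketch, not part of the original source: the JSON `path` also |
| # supports nested fields and array indexing, e.g. '$.a.b' or '$.items[0].name'. |
| # Assumes an active SparkSession bound to `spark`; the column and field names are |
| # made up for this example. |
| from pyspark.sql.functions import get_json_object |
| _jdf = spark.createDataFrame([('{"items": [{"name": "pen"}, {"name": "ink"}]}',)], ["js"]) |
| _jdf.select(get_json_object("js", "$.items[0].name").alias("first")).collect() |
| # expected: [Row(first='pen')] |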
| |
| <div class="viewcode-block" id="json_tuple"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.json_tuple.html#pyspark.sql.functions.json_tuple">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">json_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="o">*</span><span class="n">fields</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Creates a new row for a json column according to the given field names.</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> string column in json format</span> |
| <span class="sd"> fields : str</span> |
| <span class="sd"> a field or fields to extract</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a new row for each given field value from json object</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> data = [("1", '''{"f1": "value1", "f2": "value2"}'''), ("2", '''{"f1": "value12"}''')]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ("key", "jstring"))</span> |
| <span class="sd"> >>> df.select(df.key, json_tuple(df.jstring, 'f1', 'f2')).collect()</span> |
| <span class="sd"> [Row(key='1', c0='value1', c1='value2'), Row(key='2', c0='value12', c1=None)]</span> |
| <span class="sd"> """</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"json_tuple"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">fields</span><span class="p">))</span></div> |
| |
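| # --- Illustrative sketch, not part of the original source: the generator output |
| # columns default to c0, c1, ... and can usually be renamed with a multi-name |
| # alias, just like explode() on a map. Assumes an active SparkSession bound to |
| # `spark`. |
| from pyspark.sql.functions import json_tuple |
| _tdf = spark.createDataFrame([("1", '{"f1": "value1", "f2": "value2"}')], ("key", "jstring")) |
| _tdf.select(json_tuple("jstring", "f1", "f2").alias("f1", "f2")).columns # ['f1', 'f2'] |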
| |
| <div class="viewcode-block" id="from_json"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.from_json.html#pyspark.sql.functions.from_json">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">from_json</span><span class="p">(</span> |
| <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">ArrayType</span><span class="p">,</span> <span class="n">StructType</span><span class="p">,</span> <span class="n">Column</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="n">options</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Parses a column containing a JSON string into a :class:`MapType` with :class:`StringType`</span> |
| <span class="sd"> as keys type, :class:`StructType` or :class:`ArrayType` with</span> |
| <span class="sd"> the specified schema. Returns `null`, in the case of an unparseable string.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a column or column name in JSON format</span> |
| <span class="sd"> schema : :class:`DataType` or str</span> |
| <span class="sd"> a StructType, ArrayType of StructType or Python string literal with a DDL-formatted string</span> |
| <span class="sd"> to use when parsing the json column</span> |
| <span class="sd"> options : dict, optional</span> |
| <span class="sd"> options to control parsing. accepts the same options as the json datasource.</span> |
| <span class="sd"> See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_</span> |
| <span class="sd"> for the version you use.</span> |
| |
| <span class="sd"> .. # noqa</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a new column of complex type from given JSON object.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.types import *</span> |
| <span class="sd"> >>> data = [(1, '''{"a": 1}''')]</span> |
| <span class="sd"> >>> schema = StructType([StructField("a", IntegerType())])</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ("key", "value"))</span> |
| <span class="sd"> >>> df.select(from_json(df.value, schema).alias("json")).collect()</span> |
| <span class="sd"> [Row(json=Row(a=1))]</span> |
| <span class="sd"> >>> df.select(from_json(df.value, "a INT").alias("json")).collect()</span> |
| <span class="sd"> [Row(json=Row(a=1))]</span> |
| <span class="sd"> >>> df.select(from_json(df.value, "MAP<STRING,INT>").alias("json")).collect()</span> |
| <span class="sd"> [Row(json={'a': 1})]</span> |
| <span class="sd"> >>> data = [(1, '''[{"a": 1}]''')]</span> |
| <span class="sd"> >>> schema = ArrayType(StructType([StructField("a", IntegerType())]))</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ("key", "value"))</span> |
| <span class="sd"> >>> df.select(from_json(df.value, schema).alias("json")).collect()</span> |
| <span class="sd"> [Row(json=[Row(a=1)])]</span> |
| <span class="sd"> >>> schema = schema_of_json(lit('''{"a": 0}'''))</span> |
| <span class="sd"> >>> df.select(from_json(df.value, schema).alias("json")).collect()</span> |
| <span class="sd"> [Row(json=Row(a=None))]</span> |
| <span class="sd"> >>> data = [(1, '''[1, 2, 3]''')]</span> |
| <span class="sd"> >>> schema = ArrayType(IntegerType())</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ("key", "value"))</span> |
| <span class="sd"> >>> df.select(from_json(df.value, schema).alias("json")).collect()</span> |
| <span class="sd"> [Row(json=[1, 2, 3])]</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">DataType</span><span class="p">):</span> |
| <span class="n">schema</span> <span class="o">=</span> <span class="n">schema</span><span class="o">.</span><span class="n">json</span><span class="p">()</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="n">schema</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"from_json"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">schema</span><span class="p">,</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span></div> |
| |
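| # --- Illustrative sketch, not part of the original source: the `options` dict is |
| # forwarded to the JSON datasource, so parser options such as |
| # `allowUnquotedFieldNames` (also used in the schema_of_json example below) work |
| # here as well. Assumes an active SparkSession bound to `spark`. |
| from pyspark.sql.functions import from_json |
| _fdf = spark.createDataFrame([("{a: 1}",)], ["value"]) |
| _fdf.select(from_json("value", "a INT", {"allowUnquotedFieldNames": "true"}).alias("json")).collect() |
| # expected: [Row(json=Row(a=1))] |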
| |
| <div class="viewcode-block" id="to_json"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_json.html#pyspark.sql.functions.to_json">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">to_json</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">options</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Converts a column containing a :class:`StructType`, :class:`ArrayType` or a :class:`MapType`</span> |
| <span class="sd"> into a JSON string. Throws an exception, in the case of an unsupported type.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing a struct, an array or a map.</span> |
| <span class="sd"> options : dict, optional</span> |
| <span class="sd"> options to control converting. accepts the same options as the JSON datasource.</span> |
| <span class="sd"> See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_</span> |
| <span class="sd"> for the version you use.</span> |
| <span class="sd"> Additionally the function supports the `pretty` option which enables</span> |
| <span class="sd"> pretty JSON generation.</span> |
| |
| <span class="sd"> .. # noqa</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> JSON object as string column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> from pyspark.sql.types import *</span> |
| <span class="sd"> >>> data = [(1, Row(age=2, name='Alice'))]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ("key", "value"))</span> |
| <span class="sd"> >>> df.select(to_json(df.value).alias("json")).collect()</span> |
| <span class="sd"> [Row(json='{"age":2,"name":"Alice"}')]</span> |
| <span class="sd"> >>> data = [(1, [Row(age=2, name='Alice'), Row(age=3, name='Bob')])]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ("key", "value"))</span> |
| <span class="sd"> >>> df.select(to_json(df.value).alias("json")).collect()</span> |
| <span class="sd"> [Row(json='[{"age":2,"name":"Alice"},{"age":3,"name":"Bob"}]')]</span> |
| <span class="sd"> >>> data = [(1, {"name": "Alice"})]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ("key", "value"))</span> |
| <span class="sd"> >>> df.select(to_json(df.value).alias("json")).collect()</span> |
| <span class="sd"> [Row(json='{"name":"Alice"}')]</span> |
| <span class="sd"> >>> data = [(1, [{"name": "Alice"}, {"name": "Bob"}])]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ("key", "value"))</span> |
| <span class="sd"> >>> df.select(to_json(df.value).alias("json")).collect()</span> |
| <span class="sd"> [Row(json='[{"name":"Alice"},{"name":"Bob"}]')]</span> |
| <span class="sd"> >>> data = [(1, ["Alice", "Bob"])]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ("key", "value"))</span> |
| <span class="sd"> >>> df.select(to_json(df.value).alias("json")).collect()</span> |
| <span class="sd"> [Row(json='["Alice","Bob"]')]</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"to_json"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span></div> |
| |
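| # --- Illustrative sketch, not part of the original source: the `pretty` option |
| # mentioned above switches the output from the compact form to an indented, |
| # multi-line JSON string. Assumes an active SparkSession bound to `spark`. |
| from pyspark.sql import Row |
| from pyspark.sql.functions import to_json |
| _pdf = spark.createDataFrame([(1, Row(age=2, name="Alice"))], ("key", "value")) |
| _pdf.select(to_json("value").alias("json")).head()[0] # '{"age":2,"name":"Alice"}' |
| _pdf.select(to_json("value", {"pretty": "true"}).alias("json")).head()[0] # spans several indented lines |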
| |
| <div class="viewcode-block" id="schema_of_json"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.schema_of_json.html#pyspark.sql.functions.schema_of_json">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">schema_of_json</span><span class="p">(</span><span class="n">json</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">options</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Parses a JSON string and infers its schema in DDL format.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> json : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a JSON string or a foldable string column containing a JSON string.</span> |
| <span class="sd"> options : dict, optional</span> |
| <span class="sd"> options to control parsing. accepts the same options as the JSON datasource.</span> |
| <span class="sd"> See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-json.html#data-source-option>`_</span> |
| <span class="sd"> for the version you use.</span> |
| |
| <span class="sd"> .. # noqa</span> |
| |
| <span class="sd"> .. versionchanged:: 3.0.0</span> |
| <span class="sd"> It accepts `options` parameter to control schema inferring.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a string representation of a :class:`StructType` parsed from given JSON.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(schema_of_json(lit('{"a": 0}')).alias("json")).collect()</span> |
| <span class="sd"> [Row(json='STRUCT<a: BIGINT>')]</span> |
| <span class="sd"> >>> schema = schema_of_json('{a: 1}', {'allowUnquotedFieldNames':'true'})</span> |
| <span class="sd"> >>> df.select(schema.alias("json")).collect()</span> |
| <span class="sd"> [Row(json='STRUCT<a: BIGINT>')]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">json</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">json</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">json</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">json</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_COLUMN_OR_STR"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"arg_name"</span><span class="p">:</span> <span class="s2">"json"</span><span class="p">,</span> <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">json</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"schema_of_json"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="json_array_length"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.json_array_length.html#pyspark.sql.functions.json_array_length">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">json_array_length</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the number of elements in the outermost JSON array. `NULL` is returned in case of</span> |
| <span class="sd"> any other valid JSON string, `NULL` or an invalid JSON.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col: :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> length of json array.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(None,), ('[1, 2, 3]',), ('[]',)], ['data'])</span> |
| <span class="sd"> >>> df.select(json_array_length(df.data).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=None), Row(r=3), Row(r=0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"json_array_length"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="json_object_keys"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.json_object_keys.html#pyspark.sql.functions.json_object_keys">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">json_object_keys</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns all the keys of the outermost JSON object as an array. If a valid JSON object is</span> |
| <span class="sd"> given, all the keys of the outermost object will be returned as an array. If it is any</span> |
| <span class="sd"> other valid JSON string, an invalid JSON string or an empty string, the function returns null.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col: :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> all the keys of the outermost JSON object.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(None,), ('{}',), ('{"key1":1, "key2":2}',)], ['data'])</span> |
| <span class="sd"> >>> df.select(json_object_keys(df.data).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=None), Row(r=[]), Row(r=['key1', 'key2'])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"json_object_keys"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="schema_of_csv"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.schema_of_csv.html#pyspark.sql.functions.schema_of_csv">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">schema_of_csv</span><span class="p">(</span><span class="n">csv</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">options</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Parses a CSV string and infers its schema in DDL format.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> csv : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a CSV string or a foldable string column containing a CSV string.</span> |
| <span class="sd"> options : dict, optional</span> |
| <span class="sd"> options to control parsing. accepts the same options as the CSV datasource.</span> |
| <span class="sd"> See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_</span> |
| <span class="sd"> for the version you use.</span> |
| |
| <span class="sd"> .. # noqa</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a string representation of a :class:`StructType` parsed from given CSV.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(schema_of_csv(lit('1|a'), {'sep':'|'}).alias("csv")).collect()</span> |
| <span class="sd"> [Row(csv='STRUCT<_c0: INT, _c1: STRING>')]</span> |
| <span class="sd"> >>> df.select(schema_of_csv('1|a', {'sep':'|'}).alias("csv")).collect()</span> |
| <span class="sd"> [Row(csv='STRUCT<_c0: INT, _c1: STRING>')]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">csv</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">csv</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">csv</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="n">col</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">csv</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_COLUMN_OR_STR"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"arg_name"</span><span class="p">:</span> <span class="s2">"csv"</span><span class="p">,</span> <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">csv</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"schema_of_csv"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="to_csv"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.to_csv.html#pyspark.sql.functions.to_csv">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">to_csv</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">options</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Converts a column containing a :class:`StructType` into a CSV string.</span> |
| <span class="sd"> Throws an exception, in the case of an unsupported type.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column containing a struct.</span> |
| <span class="sd"> options: dict, optional</span> |
| <span class="sd"> options to control converting. accepts the same options as the CSV datasource.</span> |
| <span class="sd"> See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_</span> |
| <span class="sd"> for the version you use.</span> |
| |
| <span class="sd"> .. # noqa</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a CSV string converted from given :class:`StructType`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> data = [(1, Row(age=2, name='Alice'))]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ("key", "value"))</span> |
| <span class="sd"> >>> df.select(to_csv(df.value).alias("csv")).collect()</span> |
| <span class="sd"> [Row(csv='2,Alice')]</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"to_csv"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="size"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.size.html#pyspark.sql.functions.size">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: returns the length of the array or map stored in the column.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> length of the array/map.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([1, 2, 3],),([1],),([],)], ['data'])</span> |
| <span class="sd"> >>> df.select(size(df.data)).collect()</span> |
| <span class="sd"> [Row(size(data)=3), Row(size(data)=1), Row(size(data)=0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"size"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_min"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_min.html#pyspark.sql.functions.array_min">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_min</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: returns the minimum value of the array.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> minimum value of array.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data'])</span> |
| <span class="sd"> >>> df.select(array_min(df.data).alias('min')).collect()</span> |
| <span class="sd"> [Row(min=1), Row(min=-1)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"array_min"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_max"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_max.html#pyspark.sql.functions.array_max">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_max</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: returns the maximum value of the array.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> maximum value of an array.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([2, 1, 3],), ([None, 10, -1],)], ['data'])</span> |
| <span class="sd"> >>> df.select(array_max(df.data).alias('max')).collect()</span> |
| <span class="sd"> [Row(max=3), Row(max=10)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"array_max"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_size"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_size.html#pyspark.sql.functions.array_size">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_size</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the total number of elements in the array. The function returns null for null input.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> total number of elements in the array.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([2, 1, 3],), (None,)], ['data'])</span> |
| <span class="sd"> >>> df.select(array_size(df.data).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=3), Row(r=None)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"array_size"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
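| # --- Illustrative sketch, not part of the original source: array_size() is |
| # documented above to return null for a null input, whereas what size() returns |
| # for a null input depends on the `spark.sql.legacy.sizeOfNull` / ANSI settings |
| # (it is -1 under the default legacy behaviour), so treat it as configuration |
| # dependent. Assumes an active SparkSession bound to `spark`. |
| from pyspark.sql.functions import array_size, size |
| _ndf = spark.createDataFrame([([2, 1, 3],), (None,)], ["data"]) |
| _ndf.select(array_size("data").alias("r")).collect() # [Row(r=3), Row(r=None)] |
| _ndf.select(size("data").alias("r")).collect() # the null row's value depends on configuration |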
| |
| <div class="viewcode-block" id="cardinality"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.cardinality.html#pyspark.sql.functions.cardinality">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">cardinality</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: returns the length of the array or map stored in the column.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target column to compute on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> length of the array/map.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [([1, 2, 3],),([1],),([],)], ['data']</span> |
| <span class="sd"> ... ).select(sf.cardinality("data")).show()</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> |cardinality(data)|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"cardinality"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="sort_array"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sort_array.html#pyspark.sql.functions.sort_array">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">sort_array</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">asc</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: sorts the input array in ascending or descending order according</span> |
| <span class="sd"> to the natural ordering of the array elements. Null elements will be placed at the beginning</span> |
| <span class="sd"> of the returned array in ascending order or at the end of the returned array in descending</span> |
| <span class="sd"> order.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> asc : bool, optional</span> |
| <span class="sd"> whether to sort in ascending or descending order. If `asc` is True (default)</span> |
| <span class="sd"> then ascending and if False then descending.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> sorted array.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([2, 1, None, 3],),([1],),([],)], ['data'])</span> |
| <span class="sd"> >>> df.select(sort_array(df.data).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=[None, 1, 2, 3]), Row(r=[1]), Row(r=[])]</span> |
| <span class="sd"> >>> df.select(sort_array(df.data, asc=False).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=[3, 2, 1, None]), Row(r=[1]), Row(r=[])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"sort_array"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">asc</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_sort"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_sort.html#pyspark.sql.functions.array_sort">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_sort</span><span class="p">(</span> |
| <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">comparator</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: sorts the input array in ascending order. The elements of the input array</span> |
| <span class="sd"> must be orderable. Null elements will be placed at the end of the returned array.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Can take a `comparator` function.</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> comparator : callable, optional</span> |
| <span class="sd"> A binary ``(Column, Column) -> Column: ...``.</span> |
| <span class="sd"> The comparator will take two</span> |
| <span class="sd"> arguments representing two elements of the array. It returns a negative integer, 0, or a</span> |
| <span class="sd"> positive integer as the first element is less than, equal to, or greater than the second</span> |
| <span class="sd"> element. If the comparator function returns null, the function will fail and raise an error.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> sorted array.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([2, 1, None, 3],),([1],),([],)], ['data'])</span> |
| <span class="sd"> >>> df.select(array_sort(df.data).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=[1, 2, 3, None]), Row(r=[1]), Row(r=[])]</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(["foo", "foobar", None, "bar"],),(["foo"],),([],)], ['data'])</span> |
| <span class="sd"> >>> df.select(array_sort(</span> |
| <span class="sd"> ... "data",</span> |
| <span class="sd"> ... lambda x, y: when(x.isNull() | y.isNull(), lit(0)).otherwise(length(y) - length(x))</span> |
| <span class="sd"> ... ).alias("r")).collect()</span> |
| <span class="sd"> [Row(r=['foobar', 'foo', None, 'bar']), Row(r=['foo']), Row(r=[])]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">comparator</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"array_sort"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">"ArraySort"</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">comparator</span><span class="p">])</span></div> |
| |
| |
| <div class="viewcode-block" id="shuffle"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.shuffle.html#pyspark.sql.functions.shuffle">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">shuffle</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: Generates a random permutation of the given array.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The function is non-deterministic.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> an array of elements in random order.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([1, 20, 3, 5],), ([1, 20, None, 3],)], ['data'])</span> |
| <span class="sd"> >>> df.select(shuffle(df.data).alias('s')).collect() # doctest: +SKIP</span> |
| <span class="sd"> [Row(s=[3, 1, 5, 20]), Row(s=[20, None, 3, 1])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"shuffle"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="reverse"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.reverse.html#pyspark.sql.functions.reverse">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">reverse</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: returns a reversed string or an array with reverse order of elements.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> array of elements in reverse order.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('Spark SQL',)], ['data'])</span> |
| <span class="sd"> >>> df.select(reverse(df.data).alias('s')).collect()</span> |
| <span class="sd"> [Row(s='LQS krapS')]</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([2, 1, 3],) ,([1],) ,([],)], ['data'])</span> |
| <span class="sd"> >>> df.select(reverse(df.data).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=[3, 1, 2]), Row(r=[1]), Row(r=[])]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"reverse"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="flatten"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.flatten.html#pyspark.sql.functions.flatten">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">flatten</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: creates a single array from an array of arrays.</span> |
| <span class="sd"> If a structure of nested arrays is deeper than two levels,</span> |
| <span class="sd"> only one level of nesting is removed.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> flattened array.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([[1, 2, 3], [4, 5], [6]],), ([None, [4, 5]],)], ['data'])</span> |
| <span class="sd"> >>> df.show(truncate=False)</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> |data |</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> |[[1, 2, 3], [4, 5], [6]]|</span> |
| <span class="sd"> |[NULL, [4, 5]] |</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> >>> df.select(flatten(df.data).alias('r')).show()</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> | r|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> |[1, 2, 3, 4, 5, 6]|</span> |
| <span class="sd"> | NULL|</span> |
| <span class="sd"> +------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"flatten"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="map_contains_key"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_contains_key.html#pyspark.sql.functions.map_contains_key">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">map_contains_key</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns true if the map contains the key.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> value :</span> |
| <span class="sd"> a literal value</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> True if key is in the map and False otherwise.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import map_contains_key</span> |
| <span class="sd"> >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")</span> |
| <span class="sd"> >>> df.select(map_contains_key("data", 1)).show()</span> |
| <span class="sd"> +---------------------------------+</span> |
| <span class="sd"> |array_contains(map_keys(data), 1)|</span> |
| <span class="sd"> +---------------------------------+</span> |
| <span class="sd"> | true|</span> |
| <span class="sd"> +---------------------------------+</span> |
| <span class="sd"> >>> df.select(map_contains_key("data", -1)).show()</span> |
| <span class="sd"> +----------------------------------+</span> |
| <span class="sd"> |array_contains(map_keys(data), -1)|</span> |
| <span class="sd"> +----------------------------------+</span> |
| <span class="sd"> | false|</span> |
| <span class="sd"> +----------------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"map_contains_key"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">value</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="map_keys"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_keys.html#pyspark.sql.functions.map_keys">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">map_keys</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: Returns an unordered array containing the keys of the map.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> keys of the map as an array.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import map_keys</span> |
| <span class="sd"> >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")</span> |
| <span class="sd"> >>> df.select(map_keys("data").alias("keys")).show()</span> |
| <span class="sd"> +------+</span> |
| <span class="sd"> | keys|</span> |
| <span class="sd"> +------+</span> |
| <span class="sd"> |[1, 2]|</span> |
| <span class="sd"> +------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"map_keys"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="map_values"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_values.html#pyspark.sql.functions.map_values">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">map_values</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: Returns an unordered array containing the values of the map.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> values of the map as an array.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import map_values</span> |
| <span class="sd"> >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")</span> |
| <span class="sd"> >>> df.select(map_values("data").alias("values")).show()</span> |
| <span class="sd"> +------+</span> |
| <span class="sd"> |values|</span> |
| <span class="sd"> +------+</span> |
| <span class="sd"> |[a, b]|</span> |
| <span class="sd"> +------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"map_values"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="map_entries"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_entries.html#pyspark.sql.functions.map_entries">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">map_entries</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: Returns an unordered array of all entries in the given map.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> an array of key value pairs as a struct type</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import map_entries</span> |
| <span class="sd"> >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as data")</span> |
| <span class="sd"> >>> df = df.select(map_entries("data").alias("entries"))</span> |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> | entries|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> |[{1, a}, {2, b}]|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> >>> df.printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- entries: array (nullable = false)</span> |
| <span class="sd"> | |-- element: struct (containsNull = false)</span> |
| <span class="sd"> | | |-- key: integer (nullable = false)</span> |
| <span class="sd"> | | |-- value: string (nullable = false)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"map_entries"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="map_from_entries"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_from_entries.html#pyspark.sql.functions.map_from_entries">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">map_from_entries</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: Converts an array of entries (key value struct types) to a map</span> |
| <span class="sd"> of values.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a map created from the given array of entries.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import map_from_entries</span> |
| <span class="sd"> >>> df = spark.sql("SELECT array(struct(1, 'a'), struct(2, 'b')) as data")</span> |
| <span class="sd"> >>> df.select(map_from_entries("data").alias("map")).show()</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> | map|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> |{1 -> a, 2 -> b}|</span> |
| <span class="sd"> +----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"map_from_entries"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="array_repeat"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.array_repeat.html#pyspark.sql.functions.array_repeat">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">array_repeat</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">count</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="nb">int</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: creates an array containing a column repeated count times.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column name or column that contains the element to be repeated</span> |
| <span class="sd"> count : :class:`~pyspark.sql.Column` or str or int</span> |
| <span class="sd"> column name, column, or int containing the number of times to repeat the first argument</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> an array of repeated elements.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('ab',)], ['data'])</span> |
| <span class="sd"> >>> df.select(array_repeat(df.data, 3).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=['ab', 'ab', 'ab'])]</span> |
| <span class="sd"> """</span> |
| <span class="n">count</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">count</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">count</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">count</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"array_repeat"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">count</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="arrays_zip"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.arrays_zip.html#pyspark.sql.functions.arrays_zip">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">arrays_zip</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Collection function: Returns a merged array of structs in which the N-th struct contains all</span> |
| <span class="sd"> N-th values of input arrays. If one of the arrays is shorter than others then</span> |
| <span class="sd"> resulting struct type value will be a `null` for missing elements.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> columns of arrays to be merged.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> merged array of entries.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import arrays_zip</span> |
| <span class="sd"> >>> df = spark.createDataFrame([([1, 2, 3], [2, 4, 6], [3, 6])], ['vals1', 'vals2', 'vals3'])</span> |
| <span class="sd"> >>> df = df.select(arrays_zip(df.vals1, df.vals2, df.vals3).alias('zipped'))</span> |
| <span class="sd"> >>> df.show(truncate=False)</span> |
| <span class="sd"> +------------------------------------+</span> |
| <span class="sd"> |zipped |</span> |
| <span class="sd"> +------------------------------------+</span> |
| <span class="sd"> |[{1, 2, 3}, {2, 4, 6}, {3, 6, NULL}]|</span> |
| <span class="sd"> +------------------------------------+</span> |
| <span class="sd"> >>> df.printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- zipped: array (nullable = true)</span> |
| <span class="sd"> | |-- element: struct (containsNull = false)</span> |
| <span class="sd"> | | |-- vals1: long (nullable = true)</span> |
| <span class="sd"> | | |-- vals2: long (nullable = true)</span> |
| <span class="sd"> | | |-- vals3: long (nullable = true)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"arrays_zip"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">map_concat</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">map_concat</span><span class="p">(</span><span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">,</span> <span class="o">...</span><span class="p">]])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <div class="viewcode-block" id="map_concat"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_concat.html#pyspark.sql.functions.map_concat">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">map_concat</span><span class="p">(</span> |
| <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"ColumnOrName_"</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the union of all the given maps.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column names or :class:`~pyspark.sql.Column`\\s</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a map of merged entries from other maps.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import map_concat</span> |
| <span class="sd"> >>> df = spark.sql("SELECT map(1, 'a', 2, 'b') as map1, map(3, 'c') as map2")</span> |
| <span class="sd"> >>> df.select(map_concat("map1", "map2").alias("map3")).show(truncate=False)</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> |map3 |</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> |{1 -> a, 2 -> b, 3 -> c}|</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span> |
| <span class="n">cols</span> <span class="o">=</span> <span class="n">cols</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"map_concat"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span> <span class="c1"># type: ignore[arg-type]</span></div> |
| |
| |
| <div class="viewcode-block" id="sequence"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sequence.html#pyspark.sql.functions.sequence">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">sequence</span><span class="p">(</span> |
| <span class="n">start</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">stop</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">step</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Generate a sequence of integers from `start` to `stop`, incrementing by `step`.</span> |
| <span class="sd"> If `step` is not set, incrementing by 1 if `start` is less than or equal to `stop`,</span> |
| <span class="sd"> otherwise -1.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> start : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> starting value (inclusive)</span> |
| <span class="sd"> stop : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> last values (inclusive)</span> |
| <span class="sd"> step : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> value to add to current to get next element (default is 1)</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> an array of sequence values</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df1 = spark.createDataFrame([(-2, 2)], ('C1', 'C2'))</span> |
| <span class="sd"> >>> df1.select(sequence('C1', 'C2').alias('r')).collect()</span> |
| <span class="sd"> [Row(r=[-2, -1, 0, 1, 2])]</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(4, -4, -2)], ('C1', 'C2', 'C3'))</span> |
| <span class="sd"> >>> df2.select(sequence('C1', 'C2', 'C3').alias('r')).collect()</span> |
| <span class="sd"> [Row(r=[4, 2, 0, -2, -4])]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">step</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"sequence"</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">stop</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"sequence"</span><span class="p">,</span> <span class="n">start</span><span class="p">,</span> <span class="n">stop</span><span class="p">,</span> <span class="n">step</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="from_csv"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.from_csv.html#pyspark.sql.functions.from_csv">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">from_csv</span><span class="p">(</span> |
| <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="n">options</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Parses a column containing a CSV string to a row with the specified schema.</span> |
| <span class="sd"> Returns `null`, in the case of an unparseable string.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a column or column name in CSV format</span> |
| <span class="sd"> schema :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> a column, or Python string literal with schema in DDL format, to use when parsing the CSV column.</span> |
| <span class="sd"> options : dict, optional</span> |
| <span class="sd"> options to control parsing. accepts the same options as the CSV datasource.</span> |
| <span class="sd"> See `Data Source Option <https://spark.apache.org/docs/latest/sql-data-sources-csv.html#data-source-option>`_</span> |
| <span class="sd"> for the version you use.</span> |
| |
| <span class="sd"> .. # noqa</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a column of parsed CSV values</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> data = [("1,2,3",)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ("value",))</span> |
| <span class="sd"> >>> df.select(from_csv(df.value, "a INT, b INT, c INT").alias("csv")).collect()</span> |
| <span class="sd"> [Row(csv=Row(a=1, b=2, c=3))]</span> |
| <span class="sd"> >>> value = data[0][0]</span> |
| <span class="sd"> >>> df.select(from_csv(df.value, schema_of_csv(value)).alias("csv")).collect()</span> |
| <span class="sd"> [Row(csv=Row(_c0=1, _c1=2, _c2=3))]</span> |
| <span class="sd"> >>> data = [(" abc",)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ("value",))</span> |
| <span class="sd"> >>> options = {'ignoreLeadingWhiteSpace': True}</span> |
| <span class="sd"> >>> df.select(from_csv(df.value, "s string", options).alias("csv")).collect()</span> |
| <span class="sd"> [Row(csv=Row(s='abc'))]</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">schema</span> <span class="o">=</span> <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="n">schema</span> <span class="o">=</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_COLUMN_OR_STR"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"arg_name"</span><span class="p">:</span> <span class="s2">"schema"</span><span class="p">,</span> <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"from_csv"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">),</span> <span class="n">schema</span><span class="p">,</span> <span class="n">_options_to_str</span><span class="p">(</span><span class="n">options</span><span class="p">))</span></div> |
| |
| |
| <span class="k">def</span> <span class="nf">_unresolved_named_lambda_variable</span><span class="p">(</span><span class="o">*</span><span class="n">name_parts</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Create `o.a.s.sql.expressions.UnresolvedNamedLambdaVariable`,</span> |
| <span class="sd"> convert it to o.s.sql.Column and wrap in Python `Column`</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> name_parts : str</span> |
| <span class="sd"> """</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="n">name_parts_seq</span> <span class="o">=</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">name_parts</span><span class="p">)</span> |
| <span class="n">expressions</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">JVMView</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="p">)</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">catalyst</span><span class="o">.</span><span class="n">expressions</span> |
| <span class="k">return</span> <span class="n">Column</span><span class="p">(</span> |
| <span class="n">cast</span><span class="p">(</span><span class="n">JVMView</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="p">)</span><span class="o">.</span><span class="n">Column</span><span class="p">(</span><span class="n">expressions</span><span class="o">.</span><span class="n">UnresolvedNamedLambdaVariable</span><span class="p">(</span><span class="n">name_parts_seq</span><span class="p">))</span> |
| <span class="p">)</span> |
| |
| |
| <span class="k">def</span> <span class="nf">_get_lambda_parameters</span><span class="p">(</span><span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">)</span> <span class="o">-></span> <span class="n">ValuesView</span><span class="p">[</span><span class="n">inspect</span><span class="o">.</span><span class="n">Parameter</span><span class="p">]:</span> |
| <span class="n">signature</span> <span class="o">=</span> <span class="n">inspect</span><span class="o">.</span><span class="n">signature</span><span class="p">(</span><span class="n">f</span><span class="p">)</span> |
| <span class="n">parameters</span> <span class="o">=</span> <span class="n">signature</span><span class="o">.</span><span class="n">parameters</span><span class="o">.</span><span class="n">values</span><span class="p">()</span> |
| |
| <span class="c1"># We should exclude functions that use</span> |
| <span class="c1"># variable args and keyword argnames</span> |
| <span class="c1"># as well as keyword only args</span> |
| <span class="n">supported_parameter_types</span> <span class="o">=</span> <span class="p">{</span> |
| <span class="n">inspect</span><span class="o">.</span><span class="n">Parameter</span><span class="o">.</span><span class="n">POSITIONAL_OR_KEYWORD</span><span class="p">,</span> |
| <span class="n">inspect</span><span class="o">.</span><span class="n">Parameter</span><span class="o">.</span><span class="n">POSITIONAL_ONLY</span><span class="p">,</span> |
| <span class="p">}</span> |
| |
| <span class="c1"># Validate that</span> |
| <span class="c1"># function arity is between 1 and 3</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="p">(</span><span class="mi">1</span> <span class="o"><=</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)</span> <span class="o"><=</span> <span class="mi">3</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"WRONG_NUM_ARGS_FOR_HIGHER_ORDER_FUNCTION"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"func_name"</span><span class="p">:</span> <span class="n">f</span><span class="o">.</span><span class="vm">__name__</span><span class="p">,</span> <span class="s2">"num_args"</span><span class="p">:</span> <span class="nb">str</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">))},</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># and all arguments can be used as positional</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="n">p</span><span class="o">.</span><span class="n">kind</span> <span class="ow">in</span> <span class="n">supported_parameter_types</span> <span class="k">for</span> <span class="n">p</span> <span class="ow">in</span> <span class="n">parameters</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"UNSUPPORTED_PARAM_TYPE_FOR_HIGHER_ORDER_FUNCTION"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"func_name"</span><span class="p">:</span> <span class="n">f</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">parameters</span> |
| |
| |
| <span class="k">def</span> <span class="nf">_create_lambda</span><span class="p">(</span><span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">)</span> <span class="o">-></span> <span class="n">Callable</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Create `o.a.s.sql.expressions.LambdaFunction` corresponding</span> |
| <span class="sd"> to transformation described by f</span> |
| |
| <span class="sd"> :param f: A Python of one of the following forms:</span> |
| <span class="sd"> - (Column) -> Column: ...</span> |
| <span class="sd"> - (Column, Column) -> Column: ...</span> |
| <span class="sd"> - (Column, Column, Column) -> Column: ...</span> |
| <span class="sd"> """</span> |
| <span class="n">parameters</span> <span class="o">=</span> <span class="n">_get_lambda_parameters</span><span class="p">(</span><span class="n">f</span><span class="p">)</span> |
| |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="n">expressions</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">JVMView</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="p">)</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">catalyst</span><span class="o">.</span><span class="n">expressions</span> |
| |
| <span class="n">argnames</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"x"</span><span class="p">,</span> <span class="s2">"y"</span><span class="p">,</span> <span class="s2">"z"</span><span class="p">]</span> |
| <span class="n">args</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">_unresolved_named_lambda_variable</span><span class="p">(</span> |
| <span class="n">expressions</span><span class="o">.</span><span class="n">UnresolvedNamedLambdaVariable</span><span class="o">.</span><span class="n">freshVarName</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">argnames</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="n">parameters</span><span class="p">)]</span> |
| <span class="p">]</span> |
| |
| <span class="n">result</span> <span class="o">=</span> <span class="n">f</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">result</span><span class="p">,</span> <span class="n">Column</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"HIGHER_ORDER_FUNCTION_SHOULD_RETURN_COLUMN"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"func_name"</span><span class="p">:</span> <span class="n">f</span><span class="o">.</span><span class="vm">__name__</span><span class="p">,</span> <span class="s2">"return_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">result</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="n">jexpr</span> <span class="o">=</span> <span class="n">result</span><span class="o">.</span><span class="n">_jc</span><span class="o">.</span><span class="n">expr</span><span class="p">()</span> |
| <span class="n">jargs</span> <span class="o">=</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="p">[</span><span class="n">arg</span><span class="o">.</span><span class="n">_jc</span><span class="o">.</span><span class="n">expr</span><span class="p">()</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">])</span> |
| |
| <span class="k">return</span> <span class="n">expressions</span><span class="o">.</span><span class="n">LambdaFunction</span><span class="p">(</span><span class="n">jexpr</span><span class="p">,</span> <span class="n">jargs</span><span class="p">,</span> <span class="kc">False</span><span class="p">)</span> |
| |
| |
| <span class="k">def</span> <span class="nf">_invoke_higher_order_function</span><span class="p">(</span> |
| <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">cols</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">],</span> |
| <span class="n">funs</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Callable</span><span class="p">],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Invokes expression identified by name,</span> |
| <span class="sd"> (relative to ```org.apache.spark.sql.catalyst.expressions``)</span> |
| <span class="sd"> and wraps the result with Column (first Scala one, then Python).</span> |
| |
| <span class="sd"> :param name: Name of the expression</span> |
| <span class="sd"> :param cols: a list of columns</span> |
| <span class="sd"> :param funs: a list of (*Column) -> Column functions.</span> |
| |
| <span class="sd"> :return: a Column</span> |
| <span class="sd"> """</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="n">expressions</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">JVMView</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="p">)</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">catalyst</span><span class="o">.</span><span class="n">expressions</span> |
| <span class="n">expr</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">expressions</span><span class="p">,</span> <span class="n">name</span><span class="p">)</span> |
| |
| <span class="n">jcols</span> <span class="o">=</span> <span class="p">[</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">expr</span><span class="p">()</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">]</span> |
| <span class="n">jfuns</span> <span class="o">=</span> <span class="p">[</span><span class="n">_create_lambda</span><span class="p">(</span><span class="n">f</span><span class="p">)</span> <span class="k">for</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">funs</span><span class="p">]</span> |
| |
| <span class="k">return</span> <span class="n">Column</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">JVMView</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="p">)</span><span class="o">.</span><span class="n">Column</span><span class="p">(</span><span class="n">expr</span><span class="p">(</span><span class="o">*</span><span class="n">jcols</span> <span class="o">+</span> <span class="n">jfuns</span><span class="p">)))</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <div class="viewcode-block" id="transform"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.transform.html#pyspark.sql.functions.transform">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">transform</span><span class="p">(</span> |
| <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">f</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">]],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns an array of elements after applying a transformation to each element in the input array.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> f : function</span> |
| <span class="sd"> a function that is applied to each element of the input array.</span> |
| <span class="sd"> Can take one of the following forms:</span> |
| |
| <span class="sd"> - Unary ``(x: Column) -> Column: ...``</span> |
| <span class="sd"> - Binary ``(x: Column, i: Column) -> Column...``, where the second argument is</span> |
| <span class="sd"> a 0-based index of the element.</span> |
| |
| <span class="sd"> and can use methods of :class:`~pyspark.sql.Column`, functions defined in</span> |
| <span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span> |
| <span class="sd"> Python ``UserDefinedFunctions`` are not supported</span> |
| <span class="sd"> (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a new array of transformed elements.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, [1, 2, 3, 4])], ("key", "values"))</span> |
| <span class="sd"> >>> df.select(transform("values", lambda x: x * 2).alias("doubled")).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | doubled|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |[2, 4, 6, 8]|</span> |
| <span class="sd"> +------------+</span> |
| |
| <span class="sd"> >>> def alternate(x, i):</span> |
| <span class="sd"> ... return when(i % 2 == 0, x).otherwise(-x)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df.select(transform("values", alternate).alias("alternated")).show()</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> | alternated|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> |[1, -2, 3, -4]|</span> |
| <span class="sd"> +--------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">"ArrayTransform"</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div> |
| |
| |
| <div class="viewcode-block" id="exists"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.exists.html#pyspark.sql.functions.exists">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">exists</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns whether a predicate holds for one or more elements in the array.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> f : function</span> |
| <span class="sd"> ``(x: Column) -> Column: ...`` returning the Boolean expression.</span> |
| <span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span> |
| <span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span> |
| <span class="sd"> Python ``UserDefinedFunctions`` are not supported</span> |
| <span class="sd"> (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> True if "any" element of an array evaluates to True when passed as an argument to</span> |
| <span class="sd"> given function and False otherwise.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, [1, 2, 3, 4]), (2, [3, -1, 0])],("key", "values"))</span> |
| <span class="sd"> >>> df.select(exists("values", lambda x: x < 0).alias("any_negative")).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |any_negative|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | false|</span> |
| <span class="sd"> | true|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">"ArrayExists"</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div> |
| |
| |
| <div class="viewcode-block" id="forall"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.forall.html#pyspark.sql.functions.forall">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">forall</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns whether a predicate holds for every element in the array.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> f : function</span> |
| <span class="sd"> ``(x: Column) -> Column: ...`` returning the Boolean expression.</span> |
| <span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span> |
| <span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span> |
| <span class="sd"> Python ``UserDefinedFunctions`` are not supported</span> |
| <span class="sd"> (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> True if "all" elements of an array evaluates to True when passed as an argument to</span> |
| <span class="sd"> given function and False otherwise.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(1, ["bar"]), (2, ["foo", "bar"]), (3, ["foobar", "foo"])],</span> |
| <span class="sd"> ... ("key", "values")</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(forall("values", lambda x: x.rlike("foo")).alias("all_foo")).show()</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> |all_foo|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> | false|</span> |
| <span class="sd"> | false|</span> |
| <span class="sd"> | true|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">"ArrayForAll"</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">filter</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">filter</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <div class="viewcode-block" id="filter"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.filter.html#pyspark.sql.functions.filter">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">filter</span><span class="p">(</span> |
| <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">f</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">]],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns an array of elements for which a predicate holds in a given array.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> f : function</span> |
| <span class="sd"> A function that returns the Boolean expression.</span> |
| <span class="sd"> Can take one of the following forms:</span> |
| |
| <span class="sd"> - Unary ``(x: Column) -> Column: ...``</span> |
| <span class="sd"> - Binary ``(x: Column, i: Column) -> Column...``, where the second argument is</span> |
| <span class="sd"> a 0-based index of the element.</span> |
| |
| <span class="sd"> and can use methods of :class:`~pyspark.sql.Column`, functions defined in</span> |
| <span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span> |
| <span class="sd"> Python ``UserDefinedFunctions`` are not supported</span> |
| <span class="sd"> (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> filtered array of elements where given function evaluated to True</span> |
| <span class="sd"> when passed as an argument.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(1, ["2018-09-20", "2019-02-03", "2019-07-01", "2020-06-01"])],</span> |
| <span class="sd"> ... ("key", "values")</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> def after_second_quarter(x):</span> |
| <span class="sd"> ... return month(to_date(x)) > 6</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... filter("values", after_second_quarter).alias("after_second_quarter")</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> |after_second_quarter |</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> |[2018-09-20, 2019-07-01]|</span> |
| <span class="sd"> +------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">"ArrayFilter"</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div> |
| |
| |
| <div class="viewcode-block" id="aggregate"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.aggregate.html#pyspark.sql.functions.aggregate">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">aggregate</span><span class="p">(</span> |
| <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">initialValue</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">merge</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> |
| <span class="n">finish</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Applies a binary operator to an initial state and all elements in the array,</span> |
| <span class="sd"> and reduces this to a single state. The final state is converted into the final result</span> |
| <span class="sd"> by applying a finish function.</span> |
| |
| <span class="sd"> Both functions can use methods of :class:`~pyspark.sql.Column`, functions defined in</span> |
| <span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span> |
| <span class="sd"> Python ``UserDefinedFunctions`` are not supported</span> |
| <span class="sd"> (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> initialValue : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> initial value. Name of column or expression</span> |
| <span class="sd"> merge : function</span> |
| <span class="sd"> a binary function ``(acc: Column, x: Column) -> Column...`` returning expression</span> |
| <span class="sd"> of the same type as ``zero``</span> |
| <span class="sd"> finish : function</span> |
| <span class="sd"> an optional unary function ``(x: Column) -> Column: ...``</span> |
| <span class="sd"> used to convert accumulated value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> final value after aggregate function is applied.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, [20.0, 4.0, 2.0, 6.0, 10.0])], ("id", "values"))</span> |
| <span class="sd"> >>> df.select(aggregate("values", lit(0.0), lambda acc, x: acc + x).alias("sum")).show()</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> | sum|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> |42.0|</span> |
| <span class="sd"> +----+</span> |
| |
| <span class="sd"> >>> def merge(acc, x):</span> |
| <span class="sd"> ... count = acc.count + 1</span> |
| <span class="sd"> ... sum = acc.sum + x</span> |
| <span class="sd"> ... return struct(count.alias("count"), sum.alias("sum"))</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... aggregate(</span> |
| <span class="sd"> ... "values",</span> |
| <span class="sd"> ... struct(lit(0).alias("count"), lit(0.0).alias("sum")),</span> |
| <span class="sd"> ... merge,</span> |
| <span class="sd"> ... lambda acc: acc.sum / acc.count,</span> |
| <span class="sd"> ... ).alias("mean")</span> |
| <span class="sd"> ... ).show()</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> |mean|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> | 8.4|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">finish</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">"ArrayAggregate"</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">,</span> <span class="n">initialValue</span><span class="p">],</span> <span class="p">[</span><span class="n">merge</span><span class="p">,</span> <span class="n">finish</span><span class="p">])</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">"ArrayAggregate"</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">,</span> <span class="n">initialValue</span><span class="p">],</span> <span class="p">[</span><span class="n">merge</span><span class="p">])</span></div> |
| |
| |
| <div class="viewcode-block" id="reduce"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.reduce.html#pyspark.sql.functions.reduce">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">reduce</span><span class="p">(</span> |
| <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">initialValue</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">merge</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> |
| <span class="n">finish</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Applies a binary operator to an initial state and all elements in the array,</span> |
| <span class="sd"> and reduces this to a single state. The final state is converted into the final result</span> |
| <span class="sd"> by applying a finish function.</span> |
| |
| <span class="sd"> Both functions can use methods of :class:`~pyspark.sql.Column`, functions defined in</span> |
| <span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span> |
| <span class="sd"> Python ``UserDefinedFunctions`` are not supported</span> |
| <span class="sd"> (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> initialValue : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> initial value. Name of column or expression</span> |
| <span class="sd"> merge : function</span> |
| <span class="sd"> a binary function ``(acc: Column, x: Column) -> Column...`` returning expression</span> |
| <span class="sd"> of the same type as ``zero``</span> |
| <span class="sd"> finish : function</span> |
| <span class="sd"> an optional unary function ``(x: Column) -> Column: ...``</span> |
| <span class="sd"> used to convert accumulated value.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> final value after aggregate function is applied.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, [20.0, 4.0, 2.0, 6.0, 10.0])], ("id", "values"))</span> |
| <span class="sd"> >>> df.select(reduce("values", lit(0.0), lambda acc, x: acc + x).alias("sum")).show()</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> | sum|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> |42.0|</span> |
| <span class="sd"> +----+</span> |
| |
| <span class="sd"> >>> def merge(acc, x):</span> |
| <span class="sd"> ... count = acc.count + 1</span> |
| <span class="sd"> ... sum = acc.sum + x</span> |
| <span class="sd"> ... return struct(count.alias("count"), sum.alias("sum"))</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... reduce(</span> |
| <span class="sd"> ... "values",</span> |
| <span class="sd"> ... struct(lit(0).alias("count"), lit(0.0).alias("sum")),</span> |
| <span class="sd"> ... merge,</span> |
| <span class="sd"> ... lambda acc: acc.sum / acc.count,</span> |
| <span class="sd"> ... ).alias("mean")</span> |
| <span class="sd"> ... ).show()</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> |mean|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> | 8.4|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">finish</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">"ArrayAggregate"</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">,</span> <span class="n">initialValue</span><span class="p">],</span> <span class="p">[</span><span class="n">merge</span><span class="p">,</span> <span class="n">finish</span><span class="p">])</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">"ArrayAggregate"</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">,</span> <span class="n">initialValue</span><span class="p">],</span> <span class="p">[</span><span class="n">merge</span><span class="p">])</span></div> |
| |
| |
| <div class="viewcode-block" id="zip_with"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.zip_with.html#pyspark.sql.functions.zip_with">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">zip_with</span><span class="p">(</span> |
| <span class="n">left</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">right</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Merge two given arrays, element-wise, into a single array using a function.</span> |
| <span class="sd"> If one array is shorter, nulls are appended at the end to match the length of the longer</span> |
| <span class="sd"> array, before applying the function.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> left : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of the first column or expression</span> |
| <span class="sd"> right : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of the second column or expression</span> |
| <span class="sd"> f : function</span> |
| <span class="sd"> a binary function ``(x1: Column, x2: Column) -> Column...``</span> |
| <span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span> |
| <span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span> |
| <span class="sd"> Python ``UserDefinedFunctions`` are not supported</span> |
| <span class="sd"> (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> array of calculated values derived by applying given function to each pair of arguments.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, [1, 3, 5, 8], [0, 2, 4, 6])], ("id", "xs", "ys"))</span> |
| <span class="sd"> >>> df.select(zip_with("xs", "ys", lambda x, y: x ** y).alias("powers")).show(truncate=False)</span> |
| <span class="sd"> +---------------------------+</span> |
| <span class="sd"> |powers |</span> |
| <span class="sd"> +---------------------------+</span> |
| <span class="sd"> |[1.0, 9.0, 625.0, 262144.0]|</span> |
| <span class="sd"> +---------------------------+</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(1, ["foo", "bar"], [1, 2, 3])], ("id", "xs", "ys"))</span> |
| <span class="sd"> >>> df.select(zip_with("xs", "ys", lambda x, y: concat_ws("_", x, y)).alias("xs_ys")).show()</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> | xs_ys|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> |[foo_1, bar_2, 3]|</span> |
| <span class="sd"> +-----------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">"ZipWith"</span><span class="p">,</span> <span class="p">[</span><span class="n">left</span><span class="p">,</span> <span class="n">right</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div> |
| |
| |
| <div class="viewcode-block" id="transform_keys"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.transform_keys.html#pyspark.sql.functions.transform_keys">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">transform_keys</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Applies a function to every key-value pair in a map and returns</span> |
| <span class="sd"> a map with the results of those applications as the new keys for the pairs.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> f : function</span> |
| <span class="sd"> a binary function ``(k: Column, v: Column) -> Column...``</span> |
| <span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span> |
| <span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span> |
| <span class="sd"> Python ``UserDefinedFunctions`` are not supported</span> |
| <span class="sd"> (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a new map of enties where new keys were calculated by applying given function to</span> |
| <span class="sd"> each key value argument.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, {"foo": -2.0, "bar": 2.0})], ("id", "data"))</span> |
| <span class="sd"> >>> row = df.select(transform_keys(</span> |
| <span class="sd"> ... "data", lambda k, _: upper(k)).alias("data_upper")</span> |
| <span class="sd"> ... ).head()</span> |
| <span class="sd"> >>> sorted(row["data_upper"].items())</span> |
| <span class="sd"> [('BAR', 2.0), ('FOO', -2.0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">"TransformKeys"</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div> |
| |
| |
| <div class="viewcode-block" id="transform_values"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.transform_values.html#pyspark.sql.functions.transform_values">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">transform_values</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Applies a function to every key-value pair in a map and returns</span> |
| <span class="sd"> a map with the results of those applications as the new values for the pairs.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> f : function</span> |
| <span class="sd"> a binary function ``(k: Column, v: Column) -> Column...``</span> |
| <span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span> |
| <span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span> |
| <span class="sd"> Python ``UserDefinedFunctions`` are not supported</span> |
| <span class="sd"> (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a new map of enties where new values were calculated by applying given function to</span> |
| <span class="sd"> each key value argument.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, {"IT": 10.0, "SALES": 2.0, "OPS": 24.0})], ("id", "data"))</span> |
| <span class="sd"> >>> row = df.select(transform_values(</span> |
| <span class="sd"> ... "data", lambda k, v: when(k.isin("IT", "OPS"), v + 10.0).otherwise(v)</span> |
| <span class="sd"> ... ).alias("new_data")).head()</span> |
| <span class="sd"> >>> sorted(row["new_data"].items())</span> |
| <span class="sd"> [('IT', 20.0), ('OPS', 34.0), ('SALES', 2.0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">"TransformValues"</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div> |
| |
| |
| <div class="viewcode-block" id="map_filter"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_filter.html#pyspark.sql.functions.map_filter">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">map_filter</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a map whose key-value pairs satisfy a predicate.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of column or expression</span> |
| <span class="sd"> f : function</span> |
| <span class="sd"> a binary function ``(k: Column, v: Column) -> Column...``</span> |
| <span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span> |
| <span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span> |
| <span class="sd"> Python ``UserDefinedFunctions`` are not supported</span> |
| <span class="sd"> (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> filtered map.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, {"foo": 42.0, "bar": 1.0, "baz": 32.0})], ("id", "data"))</span> |
| <span class="sd"> >>> row = df.select(map_filter(</span> |
| <span class="sd"> ... "data", lambda _, v: v > 30.0).alias("data_filtered")</span> |
| <span class="sd"> ... ).head()</span> |
| <span class="sd"> >>> sorted(row["data_filtered"].items())</span> |
| <span class="sd"> [('baz', 32.0), ('foo', 42.0)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">"MapFilter"</span><span class="p">,</span> <span class="p">[</span><span class="n">col</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div> |
| |
| |
| <div class="viewcode-block" id="map_zip_with"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.map_zip_with.html#pyspark.sql.functions.map_zip_with">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">map_zip_with</span><span class="p">(</span> |
| <span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Merge two given maps, key-wise into a single map using a function.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of the first column or expression</span> |
| <span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> name of the second column or expression</span> |
| <span class="sd"> f : function</span> |
| <span class="sd"> a ternary function ``(k: Column, v1: Column, v2: Column) -> Column...``</span> |
| <span class="sd"> Can use methods of :class:`~pyspark.sql.Column`, functions defined in</span> |
| <span class="sd"> :py:mod:`pyspark.sql.functions` and Scala ``UserDefinedFunctions``.</span> |
| <span class="sd"> Python ``UserDefinedFunctions`` are not supported</span> |
| <span class="sd"> (`SPARK-27052 <https://issues.apache.org/jira/browse/SPARK-27052>`__).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> zipped map where entries are calculated by applying given function to each</span> |
| <span class="sd"> pair of arguments.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (1, {"IT": 24.0, "SALES": 12.00}, {"IT": 2.0, "SALES": 1.4})],</span> |
| <span class="sd"> ... ("id", "base", "ratio")</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> row = df.select(map_zip_with(</span> |
| <span class="sd"> ... "base", "ratio", lambda k, v1, v2: round(v1 * v2, 2)).alias("updated_data")</span> |
| <span class="sd"> ... ).head()</span> |
| <span class="sd"> >>> sorted(row["updated_data"].items())</span> |
| <span class="sd"> [('IT', 48.0), ('SALES', 16.8)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_higher_order_function</span><span class="p">(</span><span class="s2">"MapZipWith"</span><span class="p">,</span> <span class="p">[</span><span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">],</span> <span class="p">[</span><span class="n">f</span><span class="p">])</span></div> |
| |
| |
| <div class="viewcode-block" id="str_to_map"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.str_to_map.html#pyspark.sql.functions.str_to_map">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">str_to_map</span><span class="p">(</span> |
| <span class="n">text</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">pairDelim</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">keyValueDelim</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Creates a map after splitting the text into key/value pairs using delimiters.</span> |
| <span class="sd"> Both `pairDelim` and `keyValueDelim` are treated as regular expressions.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> text : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> Input column or strings.</span> |
| <span class="sd"> pairDelim : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> delimiter to use to split pair.</span> |
| <span class="sd"> keyValueDelim : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> delimiter to use to split key/value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("a:1,b:2,c:3",)], ["e"])</span> |
| <span class="sd"> >>> df.select(str_to_map(df.e, lit(","), lit(":")).alias('r')).collect()</span> |
| <span class="sd"> [Row(r={'a': '1', 'b': '2', 'c': '3'})]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([("a:1,b:2,c:3",)], ["e"])</span> |
| <span class="sd"> >>> df.select(str_to_map(df.e, lit(",")).alias('r')).collect()</span> |
| <span class="sd"> [Row(r={'a': '1', 'b': '2', 'c': '3'})]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([("a:1,b:2,c:3",)], ["e"])</span> |
| <span class="sd"> >>> df.select(str_to_map(df.e).alias('r')).collect()</span> |
| <span class="sd"> [Row(r={'a': '1', 'b': '2', 'c': '3'})]</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">pairDelim</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">pairDelim</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">","</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">keyValueDelim</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">keyValueDelim</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">":"</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"str_to_map"</span><span class="p">,</span> <span class="n">text</span><span class="p">,</span> <span class="n">pairDelim</span><span class="p">,</span> <span class="n">keyValueDelim</span><span class="p">)</span></div> |
| |
| |
| <span class="c1"># ---------------------- Partition transform functions --------------------------------</span> |
| |
| |
| <div class="viewcode-block" id="years"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.years.html#pyspark.sql.functions.years">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">years</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Partition transform function: A transform for timestamps and dates</span> |
| <span class="sd"> to partition data into years.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date or timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> data partitioned by years.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.writeTo("catalog.db.table").partitionedBy( # doctest: +SKIP</span> |
| <span class="sd"> ... years("ts")</span> |
| <span class="sd"> ... ).createOrReplace()</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This function can be used only in combination with</span> |
| <span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span> |
| <span class="sd"> method of the `DataFrameWriterV2`.</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"years"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="months"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.months.html#pyspark.sql.functions.months">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">months</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Partition transform function: A transform for timestamps and dates</span> |
| <span class="sd"> to partition data into months.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date or timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> data partitioned by months.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.writeTo("catalog.db.table").partitionedBy(</span> |
| <span class="sd"> ... months("ts")</span> |
| <span class="sd"> ... ).createOrReplace() # doctest: +SKIP</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This function can be used only in combination with</span> |
| <span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span> |
| <span class="sd"> method of the `DataFrameWriterV2`.</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"months"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="days"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.days.html#pyspark.sql.functions.days">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">days</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Partition transform function: A transform for timestamps and dates</span> |
| <span class="sd"> to partition data into days.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date or timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> data partitioned by days.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.writeTo("catalog.db.table").partitionedBy( # doctest: +SKIP</span> |
| <span class="sd"> ... days("ts")</span> |
| <span class="sd"> ... ).createOrReplace()</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This function can be used only in combination with</span> |
| <span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span> |
| <span class="sd"> method of the `DataFrameWriterV2`.</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"days"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="hours"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hours.html#pyspark.sql.functions.hours">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">hours</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Partition transform function: A transform for timestamps</span> |
| <span class="sd"> to partition data into hours.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date or timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> data partitioned by hours.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.writeTo("catalog.db.table").partitionedBy( # doctest: +SKIP</span> |
| <span class="sd"> ... hours("ts")</span> |
| <span class="sd"> ... ).createOrReplace()</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This function can be used only in combination with</span> |
| <span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span> |
| <span class="sd"> method of the `DataFrameWriterV2`.</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"hours"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="convert_timezone"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.convert_timezone.html#pyspark.sql.functions.convert_timezone">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">convert_timezone</span><span class="p">(</span> |
| <span class="n">sourceTz</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> <span class="n">targetTz</span><span class="p">:</span> <span class="n">Column</span><span class="p">,</span> <span class="n">sourceTs</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Converts the timestamp without time zone `sourceTs`</span> |
| <span class="sd"> from the `sourceTz` time zone to `targetTz`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> sourceTz : :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the time zone for the input timestamp. If it is missed,</span> |
| <span class="sd"> the current session time zone is used as the source time zone.</span> |
| <span class="sd"> targetTz : :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> the time zone to which the input timestamp should be converted.</span> |
| <span class="sd"> sourceTs : :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> a timestamp without time zone.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> timestamp for converted time zone.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])</span> |
| <span class="sd"> >>> df.select(convert_timezone( # doctest: +SKIP</span> |
| <span class="sd"> ... None, lit('Asia/Hong_Kong'), 'dt').alias('ts')</span> |
| <span class="sd"> ... ).show()</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> | ts|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> |2015-04-08 00:00:00|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> >>> df.select(convert_timezone(</span> |
| <span class="sd"> ... lit('America/Los_Angeles'), lit('Asia/Hong_Kong'), 'dt').alias('ts')</span> |
| <span class="sd"> ... ).show()</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> | ts|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> |2015-04-08 15:00:00|</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">sourceTz</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"convert_timezone"</span><span class="p">,</span> <span class="n">targetTz</span><span class="p">,</span> <span class="n">sourceTs</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"convert_timezone"</span><span class="p">,</span> <span class="n">sourceTz</span><span class="p">,</span> <span class="n">targetTz</span><span class="p">,</span> <span class="n">sourceTs</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="make_dt_interval"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.make_dt_interval.html#pyspark.sql.functions.make_dt_interval">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">make_dt_interval</span><span class="p">(</span> |
| <span class="n">days</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">hours</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">mins</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">secs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Make DayTimeIntervalType duration from days, hours, mins and secs.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> days : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the number of days, positive or negative</span> |
| <span class="sd"> hours : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the number of hours, positive or negative</span> |
| <span class="sd"> mins : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the number of minutes, positive or negative</span> |
| <span class="sd"> secs : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the number of seconds with the fractional part in microsecond precision.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[1, 12, 30, 01.001001]],</span> |
| <span class="sd"> ... ["day", "hour", "min", "sec"])</span> |
| <span class="sd"> >>> df.select(make_dt_interval(</span> |
| <span class="sd"> ... df.day, df.hour, df.min, df.sec).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +------------------------------------------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +------------------------------------------+</span> |
| <span class="sd"> |INTERVAL '1 12:30:01.001001' DAY TO SECOND|</span> |
| <span class="sd"> +------------------------------------------+</span> |
| |
| <span class="sd"> >>> df.select(make_dt_interval(</span> |
| <span class="sd"> ... df.day, df.hour, df.min).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +-----------------------------------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +-----------------------------------+</span> |
| <span class="sd"> |INTERVAL '1 12:30:00' DAY TO SECOND|</span> |
| <span class="sd"> +-----------------------------------+</span> |
| |
| <span class="sd"> >>> df.select(make_dt_interval(</span> |
| <span class="sd"> ... df.day, df.hour).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +-----------------------------------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +-----------------------------------+</span> |
| <span class="sd"> |INTERVAL '1 12:00:00' DAY TO SECOND|</span> |
| <span class="sd"> +-----------------------------------+</span> |
| |
| <span class="sd"> >>> df.select(make_dt_interval(df.day).alias('r')).show(truncate=False)</span> |
| <span class="sd"> +-----------------------------------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +-----------------------------------+</span> |
| <span class="sd"> |INTERVAL '1 00:00:00' DAY TO SECOND|</span> |
| <span class="sd"> +-----------------------------------+</span> |
| |
| <span class="sd"> >>> df.select(make_dt_interval().alias('r')).show(truncate=False)</span> |
| <span class="sd"> +-----------------------------------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +-----------------------------------+</span> |
| <span class="sd"> |INTERVAL '0 00:00:00' DAY TO SECOND|</span> |
| <span class="sd"> +-----------------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="n">_days</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">days</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">days</span> |
| <span class="n">_hours</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">hours</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">hours</span> |
| <span class="n">_mins</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">mins</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">mins</span> |
| <span class="n">_secs</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">decimal</span><span class="o">.</span><span class="n">Decimal</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> <span class="k">if</span> <span class="n">secs</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">secs</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"make_dt_interval"</span><span class="p">,</span> <span class="n">_days</span><span class="p">,</span> <span class="n">_hours</span><span class="p">,</span> <span class="n">_mins</span><span class="p">,</span> <span class="n">_secs</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="make_interval"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.make_interval.html#pyspark.sql.functions.make_interval">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">make_interval</span><span class="p">(</span> |
| <span class="n">years</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">months</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">weeks</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">days</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">hours</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">mins</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">secs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Make interval from years, months, weeks, days, hours, mins and secs.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> years : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the number of years, positive or negative</span> |
| <span class="sd"> months : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the number of months, positive or negative</span> |
| <span class="sd"> weeks : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the number of weeks, positive or negative</span> |
| <span class="sd"> days : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the number of days, positive or negative</span> |
| <span class="sd"> hours : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the number of hours, positive or negative</span> |
| <span class="sd"> mins : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the number of minutes, positive or negative</span> |
| <span class="sd"> secs : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the number of seconds with the fractional part in microsecond precision.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[100, 11, 1, 1, 12, 30, 01.001001]],</span> |
| <span class="sd"> ... ["year", "month", "week", "day", "hour", "min", "sec"])</span> |
| <span class="sd"> >>> df.select(make_interval(</span> |
| <span class="sd"> ... df.year, df.month, df.week, df.day, df.hour, df.min, df.sec).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +---------------------------------------------------------------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +---------------------------------------------------------------+</span> |
| <span class="sd"> |100 years 11 months 8 days 12 hours 30 minutes 1.001001 seconds|</span> |
| <span class="sd"> +---------------------------------------------------------------+</span> |
| |
| <span class="sd"> >>> df.select(make_interval(</span> |
| <span class="sd"> ... df.year, df.month, df.week, df.day, df.hour, df.min).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +----------------------------------------------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +----------------------------------------------+</span> |
| <span class="sd"> |100 years 11 months 8 days 12 hours 30 minutes|</span> |
| <span class="sd"> +----------------------------------------------+</span> |
| |
| <span class="sd"> >>> df.select(make_interval(</span> |
| <span class="sd"> ... df.year, df.month, df.week, df.day, df.hour).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +-----------------------------------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +-----------------------------------+</span> |
| <span class="sd"> |100 years 11 months 8 days 12 hours|</span> |
| <span class="sd"> +-----------------------------------+</span> |
| |
| <span class="sd"> >>> df.select(make_interval(</span> |
| <span class="sd"> ... df.year, df.month, df.week, df.day).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +--------------------------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +--------------------------+</span> |
| <span class="sd"> |100 years 11 months 8 days|</span> |
| <span class="sd"> +--------------------------+</span> |
| |
| <span class="sd"> >>> df.select(make_interval(</span> |
| <span class="sd"> ... df.year, df.month, df.week).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +--------------------------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +--------------------------+</span> |
| <span class="sd"> |100 years 11 months 7 days|</span> |
| <span class="sd"> +--------------------------+</span> |
| |
| <span class="sd"> >>> df.select(make_interval(df.year, df.month).alias('r')).show(truncate=False)</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +-------------------+</span> |
| <span class="sd"> |100 years 11 months|</span> |
| <span class="sd"> +-------------------+</span> |
| |
| <span class="sd"> >>> df.select(make_interval(df.year).alias('r')).show(truncate=False)</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |100 years|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> """</span> |
| <span class="n">_years</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">years</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">years</span> |
| <span class="n">_months</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">months</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">months</span> |
| <span class="n">_weeks</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">weeks</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">weeks</span> |
| <span class="n">_days</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">days</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">days</span> |
| <span class="n">_hours</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">hours</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">hours</span> |
| <span class="n">_mins</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">mins</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">mins</span> |
| <span class="n">_secs</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">decimal</span><span class="o">.</span><span class="n">Decimal</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> <span class="k">if</span> <span class="n">secs</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">secs</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span> |
| <span class="s2">"make_interval"</span><span class="p">,</span> <span class="n">_years</span><span class="p">,</span> <span class="n">_months</span><span class="p">,</span> <span class="n">_weeks</span><span class="p">,</span> <span class="n">_days</span><span class="p">,</span> <span class="n">_hours</span><span class="p">,</span> <span class="n">_mins</span><span class="p">,</span> <span class="n">_secs</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="make_timestamp"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.make_timestamp.html#pyspark.sql.functions.make_timestamp">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">make_timestamp</span><span class="p">(</span> |
| <span class="n">years</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">months</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">days</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">hours</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">mins</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">secs</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">timezone</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Create timestamp from years, months, days, hours, mins, secs and timezone fields.</span> |
| <span class="sd"> The result data type is consistent with the value of configuration `spark.sql.timestampType`.</span> |
| <span class="sd"> If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL</span> |
| <span class="sd"> on invalid inputs. Otherwise, it will throw an error instead.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> years : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the year to represent, from 1 to 9999</span> |
| <span class="sd"> months : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the month-of-year to represent, from 1 (January) to 12 (December)</span> |
| <span class="sd"> days : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the day-of-month to represent, from 1 to 31</span> |
| <span class="sd"> hours : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the hour-of-day to represent, from 0 to 23</span> |
| <span class="sd"> mins : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the minute-of-hour to represent, from 0 to 59</span> |
| <span class="sd"> secs : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the second-of-minute and its micro-fraction to represent, from 0 to 60.</span> |
| <span class="sd"> The value can be either an integer like 13 , or a fraction like 13.123.</span> |
| <span class="sd"> If the sec argument equals to 60, the seconds field is set</span> |
| <span class="sd"> to 0 and 1 minute is added to the final timestamp.</span> |
| <span class="sd"> timezone : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the time zone identifier. For example, CET, UTC and etc.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']],</span> |
| <span class="sd"> ... ["year", "month", "day", "hour", "min", "sec", "timezone"])</span> |
| <span class="sd"> >>> df.select(make_timestamp(</span> |
| <span class="sd"> ... df.year, df.month, df.day, df.hour, df.min, df.sec, df.timezone).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> |2014-12-27 21:30:45.887|</span> |
| <span class="sd"> +-----------------------+</span> |
| |
| <span class="sd"> >>> df.select(make_timestamp(</span> |
| <span class="sd"> ... df.year, df.month, df.day, df.hour, df.min, df.sec).alias('r')</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> |2014-12-28 06:30:45.887|</span> |
| <span class="sd"> +-----------------------+</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">timezone</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span> |
| <span class="s2">"make_timestamp"</span><span class="p">,</span> <span class="n">years</span><span class="p">,</span> <span class="n">months</span><span class="p">,</span> <span class="n">days</span><span class="p">,</span> <span class="n">hours</span><span class="p">,</span> <span class="n">mins</span><span class="p">,</span> <span class="n">secs</span><span class="p">,</span> <span class="n">timezone</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span> |
| <span class="s2">"make_timestamp"</span><span class="p">,</span> <span class="n">years</span><span class="p">,</span> <span class="n">months</span><span class="p">,</span> <span class="n">days</span><span class="p">,</span> <span class="n">hours</span><span class="p">,</span> <span class="n">mins</span><span class="p">,</span> <span class="n">secs</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="make_timestamp_ltz"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.make_timestamp_ltz.html#pyspark.sql.functions.make_timestamp_ltz">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">make_timestamp_ltz</span><span class="p">(</span> |
| <span class="n">years</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">months</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">days</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">hours</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">mins</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">secs</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">timezone</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Create the current timestamp with local time zone from years, months, days, hours, mins,</span> |
| <span class="sd"> secs and timezone fields. If the configuration `spark.sql.ansi.enabled` is false,</span> |
| <span class="sd"> the function returns NULL on invalid inputs. Otherwise, it will throw an error instead.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> years : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the year to represent, from 1 to 9999</span> |
| <span class="sd"> months : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the month-of-year to represent, from 1 (January) to 12 (December)</span> |
| <span class="sd"> days : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the day-of-month to represent, from 1 to 31</span> |
| <span class="sd"> hours : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the hour-of-day to represent, from 0 to 23</span> |
| <span class="sd"> mins : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the minute-of-hour to represent, from 0 to 59</span> |
| <span class="sd"> secs : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the second-of-minute and its micro-fraction to represent, from 0 to 60.</span> |
| <span class="sd"> The value can be either an integer like 13 , or a fraction like 13.123.</span> |
| <span class="sd"> If the sec argument equals to 60, the seconds field is set</span> |
| <span class="sd"> to 0 and 1 minute is added to the final timestamp.</span> |
| <span class="sd"> timezone : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the time zone identifier. For example, CET, UTC and etc.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887, 'CET']],</span> |
| <span class="sd"> ... ["year", "month", "day", "hour", "min", "sec", "timezone"])</span> |
| <span class="sd"> >>> df.select(sf.make_timestamp_ltz(</span> |
| <span class="sd"> ... df.year, df.month, df.day, df.hour, df.min, df.sec, df.timezone)</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +--------------------------------------------------------------+</span> |
| <span class="sd"> |make_timestamp_ltz(year, month, day, hour, min, sec, timezone)|</span> |
| <span class="sd"> +--------------------------------------------------------------+</span> |
| <span class="sd"> |2014-12-27 21:30:45.887 |</span> |
| <span class="sd"> +--------------------------------------------------------------+</span> |
| |
| <span class="sd"> >>> df.select(sf.make_timestamp_ltz(</span> |
| <span class="sd"> ... df.year, df.month, df.day, df.hour, df.min, df.sec)</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +----------------------------------------------------+</span> |
| <span class="sd"> |make_timestamp_ltz(year, month, day, hour, min, sec)|</span> |
| <span class="sd"> +----------------------------------------------------+</span> |
| <span class="sd"> |2014-12-28 06:30:45.887 |</span> |
| <span class="sd"> +----------------------------------------------------+</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">timezone</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span> |
| <span class="s2">"make_timestamp_ltz"</span><span class="p">,</span> <span class="n">years</span><span class="p">,</span> <span class="n">months</span><span class="p">,</span> <span class="n">days</span><span class="p">,</span> <span class="n">hours</span><span class="p">,</span> <span class="n">mins</span><span class="p">,</span> <span class="n">secs</span><span class="p">,</span> <span class="n">timezone</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span> |
| <span class="s2">"make_timestamp_ltz"</span><span class="p">,</span> <span class="n">years</span><span class="p">,</span> <span class="n">months</span><span class="p">,</span> <span class="n">days</span><span class="p">,</span> <span class="n">hours</span><span class="p">,</span> <span class="n">mins</span><span class="p">,</span> <span class="n">secs</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="make_timestamp_ntz"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.make_timestamp_ntz.html#pyspark.sql.functions.make_timestamp_ntz">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">make_timestamp_ntz</span><span class="p">(</span> |
| <span class="n">years</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">months</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">days</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">hours</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">mins</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">secs</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Create local date-time from years, months, days, hours, mins, secs fields.</span> |
| <span class="sd"> If the configuration `spark.sql.ansi.enabled` is false, the function returns NULL</span> |
| <span class="sd"> on invalid inputs. Otherwise, it will throw an error instead.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> years : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the year to represent, from 1 to 9999</span> |
| <span class="sd"> months : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the month-of-year to represent, from 1 (January) to 12 (December)</span> |
| <span class="sd"> days : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the day-of-month to represent, from 1 to 31</span> |
| <span class="sd"> hours : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the hour-of-day to represent, from 0 to 23</span> |
| <span class="sd"> mins : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the minute-of-hour to represent, from 0 to 59</span> |
| <span class="sd"> secs : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the second-of-minute and its micro-fraction to represent, from 0 to 60.</span> |
| <span class="sd"> The value can be either an integer like 13 , or a fraction like 13.123.</span> |
| <span class="sd"> If the sec argument equals to 60, the seconds field is set</span> |
| <span class="sd"> to 0 and 1 minute is added to the final timestamp.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[2014, 12, 28, 6, 30, 45.887]],</span> |
| <span class="sd"> ... ["year", "month", "day", "hour", "min", "sec"])</span> |
| <span class="sd"> >>> df.select(sf.make_timestamp_ntz(</span> |
| <span class="sd"> ... df.year, df.month, df.day, df.hour, df.min, df.sec)</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +----------------------------------------------------+</span> |
| <span class="sd"> |make_timestamp_ntz(year, month, day, hour, min, sec)|</span> |
| <span class="sd"> +----------------------------------------------------+</span> |
| <span class="sd"> |2014-12-28 06:30:45.887 |</span> |
| <span class="sd"> +----------------------------------------------------+</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span> |
| <span class="s2">"make_timestamp_ntz"</span><span class="p">,</span> <span class="n">years</span><span class="p">,</span> <span class="n">months</span><span class="p">,</span> <span class="n">days</span><span class="p">,</span> <span class="n">hours</span><span class="p">,</span> <span class="n">mins</span><span class="p">,</span> <span class="n">secs</span> |
| <span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="make_ym_interval"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.make_ym_interval.html#pyspark.sql.functions.make_ym_interval">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">make_ym_interval</span><span class="p">(</span> |
| <span class="n">years</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">months</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Make year-month interval from years, months.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> years : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the number of years, positive or negative</span> |
| <span class="sd"> months : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the number of months, positive or negative</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")</span> |
| <span class="sd"> >>> df = spark.createDataFrame([[2014, 12]], ["year", "month"])</span> |
| <span class="sd"> >>> df.select(make_ym_interval(df.year, df.month).alias('r')).show(truncate=False)</span> |
| <span class="sd"> +-------------------------------+</span> |
| <span class="sd"> |r |</span> |
| <span class="sd"> +-------------------------------+</span> |
| <span class="sd"> |INTERVAL '2015-0' YEAR TO MONTH|</span> |
| <span class="sd"> +-------------------------------+</span> |
| <span class="sd"> >>> spark.conf.unset("spark.sql.session.timeZone")</span> |
| <span class="sd"> """</span> |
| <span class="n">_years</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">years</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">years</span> |
| <span class="n">_months</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> <span class="k">if</span> <span class="n">months</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">months</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"make_ym_interval"</span><span class="p">,</span> <span class="n">_years</span><span class="p">,</span> <span class="n">_months</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bucket"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bucket.html#pyspark.sql.functions.bucket">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bucket</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="nb">int</span><span class="p">],</span> <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Partition transform function: A transform for any type that partitions</span> |
| <span class="sd"> by a hash of the input column.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.writeTo("catalog.db.table").partitionedBy( # doctest: +SKIP</span> |
| <span class="sd"> ... bucket(42, "ts")</span> |
| <span class="sd"> ... ).createOrReplace()</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> target date or timestamp column to work on.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> data partitioned by given columns.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This function can be used only in combination with</span> |
| <span class="sd"> :py:meth:`~pyspark.sql.readwriter.DataFrameWriterV2.partitionedBy`</span> |
| <span class="sd"> method of the `DataFrameWriterV2`.</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">,</span> <span class="p">(</span><span class="nb">int</span><span class="p">,</span> <span class="n">Column</span><span class="p">)):</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">error_class</span><span class="o">=</span><span class="s2">"NOT_COLUMN_OR_INT"</span><span class="p">,</span> |
| <span class="n">message_parameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"arg_name"</span><span class="p">:</span> <span class="s2">"numBuckets"</span><span class="p">,</span> <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="n">numBuckets</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">_create_column_from_literal</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> |
| <span class="k">else</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">numBuckets</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"bucket"</span><span class="p">,</span> <span class="n">numBuckets</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="call_udf"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.call_udf.html#pyspark.sql.functions.call_udf">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">call_udf</span><span class="p">(</span><span class="n">udfName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Call an user-defined function.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> udfName : str</span> |
| <span class="sd"> name of the user defined function (UDF)</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column names or :class:`~pyspark.sql.Column`\\s to be used in the UDF</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> result of executed udf.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import call_udf, col</span> |
| <span class="sd"> >>> from pyspark.sql.types import IntegerType, StringType</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "c")],["id", "name"])</span> |
| <span class="sd"> >>> _ = spark.udf.register("intX2", lambda i: i * 2, IntegerType())</span> |
| <span class="sd"> >>> df.select(call_udf("intX2", "id")).show()</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |intX2(id)|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> | 6|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> >>> _ = spark.udf.register("strX2", lambda s: s * 2, StringType())</span> |
| <span class="sd"> >>> df.select(call_udf("strX2", col("name"))).show()</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> |strX2(name)|</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> | aa|</span> |
| <span class="sd"> | bb|</span> |
| <span class="sd"> | cc|</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> """</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"call_udf"</span><span class="p">,</span> <span class="n">udfName</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="call_function"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.call_function.html#pyspark.sql.functions.call_function">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">call_function</span><span class="p">(</span><span class="n">funcName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Call a SQL function.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> funcName : str</span> |
| <span class="sd"> function name that follows the SQL identifier syntax (can be quoted, can be qualified)</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> column names or :class:`~pyspark.sql.Column`\\s to be used in the function</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> result of executed function.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import call_udf, col</span> |
| <span class="sd"> >>> from pyspark.sql.types import IntegerType, StringType</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, "a"),(2, "b"), (3, "c")],["id", "name"])</span> |
| <span class="sd"> >>> _ = spark.udf.register("intX2", lambda i: i * 2, IntegerType())</span> |
| <span class="sd"> >>> df.select(call_function("intX2", "id")).show()</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |intX2(id)|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> | 6|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> >>> _ = spark.udf.register("strX2", lambda s: s * 2, StringType())</span> |
| <span class="sd"> >>> df.select(call_function("strX2", col("name"))).show()</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> |strX2(name)|</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> | aa|</span> |
| <span class="sd"> | bb|</span> |
| <span class="sd"> | cc|</span> |
| <span class="sd"> +-----------+</span> |
| <span class="sd"> >>> df.select(call_function("avg", col("id"))).show()</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> |avg(id)|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> | 2.0|</span> |
| <span class="sd"> +-------+</span> |
| <span class="sd"> >>> _ = spark.sql("CREATE FUNCTION custom_avg AS 'test.org.apache.spark.sql.MyDoubleAvg'")</span> |
| <span class="sd"> ... # doctest: +SKIP</span> |
| <span class="sd"> >>> df.select(call_function("custom_avg", col("id"))).show()</span> |
| <span class="sd"> ... # doctest: +SKIP</span> |
| <span class="sd"> +------------------------------------+</span> |
| <span class="sd"> |spark_catalog.default.custom_avg(id)|</span> |
| <span class="sd"> +------------------------------------+</span> |
| <span class="sd"> | 102.0|</span> |
| <span class="sd"> +------------------------------------+</span> |
| <span class="sd"> >>> df.select(call_function("spark_catalog.default.custom_avg", col("id"))).show()</span> |
| <span class="sd"> ... # doctest: +SKIP</span> |
| <span class="sd"> +------------------------------------+</span> |
| <span class="sd"> |spark_catalog.default.custom_avg(id)|</span> |
| <span class="sd"> +------------------------------------+</span> |
| <span class="sd"> | 102.0|</span> |
| <span class="sd"> +------------------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">get_active_spark_context</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"call_function"</span><span class="p">,</span> <span class="n">funcName</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="unwrap_udt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.unwrap_udt.html#pyspark.sql.functions.unwrap_udt">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">unwrap_udt</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Unwrap UDT data type column into its underlying type.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"unwrap_udt"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="hll_sketch_agg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hll_sketch_agg.html#pyspark.sql.functions.hll_sketch_agg">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">hll_sketch_agg</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">lgConfigK</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the updatable binary representation of the Datasketches</span> |
| <span class="sd"> HllSketch configured with lgConfigK arg.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str or int</span> |
| <span class="sd"> lgConfigK : int, optional</span> |
| <span class="sd"> The log-base-2 of K, where K is the number of buckets or slots for the HllSketch</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> The binary representation of the HllSketch.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([1,2,2,3], "INT")</span> |
| <span class="sd"> >>> df1 = df.agg(hll_sketch_estimate(hll_sketch_agg("value")).alias("distinct_cnt"))</span> |
| <span class="sd"> >>> df1.show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |distinct_cnt|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> >>> df2 = df.agg(hll_sketch_estimate(</span> |
| <span class="sd"> ... hll_sketch_agg("value", lit(12))</span> |
| <span class="sd"> ... ).alias("distinct_cnt"))</span> |
| <span class="sd"> >>> df2.show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |distinct_cnt|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> >>> df3 = df.agg(hll_sketch_estimate(</span> |
| <span class="sd"> ... hll_sketch_agg(col("value"), lit(12))).alias("distinct_cnt"))</span> |
| <span class="sd"> >>> df3.show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |distinct_cnt|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">lgConfigK</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"hll_sketch_agg"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">_lgConfigK</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="n">lgConfigK</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">lgConfigK</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span> <span class="k">else</span> <span class="n">lgConfigK</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"hll_sketch_agg"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">_lgConfigK</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="hll_union_agg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hll_union_agg.html#pyspark.sql.functions.hll_union_agg">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">hll_union_agg</span><span class="p">(</span> |
| <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">allowDifferentLgConfigK</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Aggregate function: returns the updatable binary representation of the Datasketches</span> |
| <span class="sd"> HllSketch, generated by merging previously created Datasketches HllSketch instances</span> |
| <span class="sd"> via a Datasketches Union instance. Throws an exception if sketches have different</span> |
| <span class="sd"> lgConfigK values and allowDifferentLgConfigK is unset or set to false.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str or bool</span> |
| <span class="sd"> allowDifferentLgConfigK : bool, optional</span> |
| <span class="sd"> Allow sketches with different lgConfigK values to be merged (defaults to false).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> The binary representation of the merged HllSketch.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df1 = spark.createDataFrame([1,2,2,3], "INT")</span> |
| <span class="sd"> >>> df1 = df1.agg(hll_sketch_agg("value").alias("sketch"))</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([4,5,5,6], "INT")</span> |
| <span class="sd"> >>> df2 = df2.agg(hll_sketch_agg("value").alias("sketch"))</span> |
| <span class="sd"> >>> df3 = df1.union(df2).agg(hll_sketch_estimate(</span> |
| <span class="sd"> ... hll_union_agg("sketch")</span> |
| <span class="sd"> ... ).alias("distinct_cnt"))</span> |
| <span class="sd"> >>> df3.drop("sketch").show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |distinct_cnt|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 6|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> >>> df4 = df1.union(df2).agg(hll_sketch_estimate(</span> |
| <span class="sd"> ... hll_union_agg("sketch", lit(False))</span> |
| <span class="sd"> ... ).alias("distinct_cnt"))</span> |
| <span class="sd"> >>> df4.drop("sketch").show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |distinct_cnt|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 6|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> >>> df5 = df1.union(df2).agg(hll_sketch_estimate(</span> |
| <span class="sd"> ... hll_union_agg(col("sketch"), lit(False))</span> |
| <span class="sd"> ... ).alias("distinct_cnt"))</span> |
| <span class="sd"> >>> df5.drop("sketch").show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |distinct_cnt|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 6|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">allowDifferentLgConfigK</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"hll_union_agg"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">_allowDifferentLgConfigK</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">lit</span><span class="p">(</span><span class="n">allowDifferentLgConfigK</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">allowDifferentLgConfigK</span><span class="p">,</span> <span class="nb">bool</span><span class="p">)</span> |
| <span class="k">else</span> <span class="n">allowDifferentLgConfigK</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"hll_union_agg"</span><span class="p">,</span> <span class="n">col</span><span class="p">,</span> <span class="n">_allowDifferentLgConfigK</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="hll_sketch_estimate"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hll_sketch_estimate.html#pyspark.sql.functions.hll_sketch_estimate">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">hll_sketch_estimate</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the estimated number of unique values given the binary representation</span> |
| <span class="sd"> of a Datasketches HllSketch.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> The estimated number of unique values for the HllSketch.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([1,2,2,3], "INT")</span> |
| <span class="sd"> >>> df = df.agg(hll_sketch_estimate(hll_sketch_agg("value")).alias("distinct_cnt"))</span> |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |distinct_cnt|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"hll_sketch_estimate"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col</span><span class="p">))</span></div> |
| |
| |
| <div class="viewcode-block" id="hll_union"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.hll_union.html#pyspark.sql.functions.hll_union">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">hll_union</span><span class="p">(</span> |
| <span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">allowDifferentLgConfigK</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Merges two binary representations of Datasketches HllSketch objects, using a</span> |
| <span class="sd"> Datasketches Union object. Throws an exception if sketches have different</span> |
| <span class="sd"> lgConfigK values and allowDifferentLgConfigK is unset or set to false.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> allowDifferentLgConfigK : bool, optional</span> |
| <span class="sd"> Allow sketches with different lgConfigK values to be merged (defaults to false).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`~pyspark.sql.Column`</span> |
| <span class="sd"> The binary representation of the merged HllSketch.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1,4),(2,5),(2,5),(3,6)], "struct<v1:int,v2:int>")</span> |
| <span class="sd"> >>> df = df.agg(hll_sketch_agg("v1").alias("sketch1"), hll_sketch_agg("v2").alias("sketch2"))</span> |
| <span class="sd"> >>> df = df.withColumn("distinct_cnt", hll_sketch_estimate(hll_union("sketch1", "sketch2")))</span> |
| <span class="sd"> >>> df.drop("sketch1", "sketch2").show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |distinct_cnt|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 6|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">allowDifferentLgConfigK</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span> |
| <span class="s2">"hll_union"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col2</span><span class="p">),</span> <span class="n">allowDifferentLgConfigK</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_invoke_function</span><span class="p">(</span><span class="s2">"hll_union"</span><span class="p">,</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col1</span><span class="p">),</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">col2</span><span class="p">))</span></div> |
| |
| |
| <span class="c1"># ---------------------- Predicates functions ------------------------------</span> |
| |
| |
| <div class="viewcode-block" id="ifnull"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.ifnull.html#pyspark.sql.functions.ifnull">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">ifnull</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns `col2` if `col1` is null, or `col1` otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(None,), (1,)], ["e"])</span> |
| <span class="sd"> >>> df.select(sf.ifnull(df.e, sf.lit(8))).show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |ifnull(e, 8)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 8|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"ifnull"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="isnotnull"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.isnotnull.html#pyspark.sql.functions.isnotnull">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">isnotnull</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns true if `col` is not null, or false otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(None,), (1,)], ["e"])</span> |
| <span class="sd"> >>> df.select(isnotnull(df.e).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=False), Row(r=True)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"isnotnull"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="equal_null"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.equal_null.html#pyspark.sql.functions.equal_null">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">equal_null</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns same result as the EQUAL(=) operator for non-null operands,</span> |
| <span class="sd"> but returns true if both are null, false if one of the them is null.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(None, None,), (1, 9,)], ["a", "b"])</span> |
| <span class="sd"> >>> df.select(equal_null(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=True), Row(r=False)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"equal_null"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="nullif"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.nullif.html#pyspark.sql.functions.nullif">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">nullif</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns null if `col1` equals to `col2`, or `col1` otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(None, None,), (1, 9,)], ["a", "b"])</span> |
| <span class="sd"> >>> df.select(nullif(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=None), Row(r=1)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"nullif"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="nvl"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.nvl.html#pyspark.sql.functions.nvl">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">nvl</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns `col2` if `col1` is null, or `col1` otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(None, 8,), (1, 9,)], ["a", "b"])</span> |
| <span class="sd"> >>> df.select(nvl(df.a, df.b).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=8), Row(r=1)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"nvl"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="nvl2"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.nvl2.html#pyspark.sql.functions.nvl2">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">nvl2</span><span class="p">(</span><span class="n">col1</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">col3</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns `col2` if `col1` is not null, or `col3` otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> col2 : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> col3 : :class:`~pyspark.sql.Column` or str</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(None, 8, 6,), (1, 9, 9,)], ["a", "b", "c"])</span> |
| <span class="sd"> >>> df.select(nvl2(df.a, df.b, df.c).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=6), Row(r=9)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"nvl2"</span><span class="p">,</span> <span class="n">col1</span><span class="p">,</span> <span class="n">col2</span><span class="p">,</span> <span class="n">col3</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="aes_encrypt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.aes_encrypt.html#pyspark.sql.functions.aes_encrypt">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">aes_encrypt</span><span class="p">(</span> |
| <span class="nb">input</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">key</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">mode</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">padding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">iv</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">aad</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns an encrypted value of `input` using AES in given `mode` with the specified `padding`.</span> |
| <span class="sd"> Key lengths of 16, 24 and 32 bits are supported. Supported combinations of (`mode`,</span> |
| <span class="sd"> `padding`) are ('ECB', 'PKCS'), ('GCM', 'NONE') and ('CBC', 'PKCS'). Optional initialization</span> |
| <span class="sd"> vectors (IVs) are only supported for CBC and GCM modes. These must be 16 bytes for CBC and 12</span> |
| <span class="sd"> bytes for GCM. If not provided, a random vector will be generated and prepended to the</span> |
| <span class="sd"> output. Optional additional authenticated data (AAD) is only supported for GCM. If provided</span> |
| <span class="sd"> for encryption, the identical AAD value must be provided for decryption. The default mode is</span> |
| <span class="sd"> GCM.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> input : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The binary value to encrypt.</span> |
| <span class="sd"> key : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The passphrase to use to encrypt the data.</span> |
| <span class="sd"> mode : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> Specifies which block cipher mode should be used to encrypt messages. Valid modes: ECB,</span> |
| <span class="sd"> GCM, CBC.</span> |
| <span class="sd"> padding : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> Specifies how to pad messages whose length is not a multiple of the block size. Valid</span> |
| <span class="sd"> values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS</span> |
| <span class="sd"> for CBC.</span> |
| <span class="sd"> iv : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> Optional initialization vector. Only supported for CBC and GCM modes. Valid values: None or</span> |
| <span class="sd"> "". 16-byte array for CBC mode. 12-byte array for GCM mode.</span> |
| <span class="sd"> aad : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> Optional additional authenticated data. Only supported for GCM mode. This can be any</span> |
| <span class="sd"> free-form input and must be provided for both encryption and decryption.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(</span> |
| <span class="sd"> ... "Spark", "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT",</span> |
| <span class="sd"> ... "000000000000000000000000", "This is an AAD mixed into the input",)],</span> |
| <span class="sd"> ... ["input", "key", "mode", "padding", "iv", "aad"]</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(base64(aes_encrypt(</span> |
| <span class="sd"> ... df.input, df.key, df.mode, df.padding, to_binary(df.iv, lit("hex")), df.aad)</span> |
| <span class="sd"> ... ).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4')]</span> |
| |
| <span class="sd"> >>> df.select(base64(aes_encrypt(</span> |
| <span class="sd"> ... df.input, df.key, df.mode, df.padding, to_binary(df.iv, lit("hex")))</span> |
| <span class="sd"> ... ).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='AAAAAAAAAAAAAAAAQiYi+sRNYDAOTjdSEcYBFsAWPL1f')]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(</span> |
| <span class="sd"> ... "Spark SQL", "1234567890abcdef", "ECB", "PKCS",)],</span> |
| <span class="sd"> ... ["input", "key", "mode", "padding"]</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(aes_decrypt(aes_encrypt(df.input, df.key, df.mode, df.padding),</span> |
| <span class="sd"> ... df.key, df.mode, df.padding).alias('r')</span> |
| <span class="sd"> ... ).collect()</span> |
| <span class="sd"> [Row(r=bytearray(b'Spark SQL'))]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(</span> |
| <span class="sd"> ... "Spark SQL", "0000111122223333", "ECB",)],</span> |
| <span class="sd"> ... ["input", "key", "mode"]</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(aes_decrypt(aes_encrypt(df.input, df.key, df.mode),</span> |
| <span class="sd"> ... df.key, df.mode).alias('r')</span> |
| <span class="sd"> ... ).collect()</span> |
| <span class="sd"> [Row(r=bytearray(b'Spark SQL'))]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(</span> |
| <span class="sd"> ... "Spark SQL", "abcdefghijklmnop",)],</span> |
| <span class="sd"> ... ["input", "key"]</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(aes_decrypt(</span> |
| <span class="sd"> ... unbase64(base64(aes_encrypt(df.input, df.key))), df.key</span> |
| <span class="sd"> ... ).cast("STRING").alias('r')).collect()</span> |
| <span class="sd"> [Row(r='Spark SQL')]</span> |
| <span class="sd"> """</span> |
| <span class="n">_mode</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">"GCM"</span><span class="p">)</span> <span class="k">if</span> <span class="n">mode</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">mode</span> |
| <span class="n">_padding</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">"DEFAULT"</span><span class="p">)</span> <span class="k">if</span> <span class="n">padding</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">padding</span> |
| <span class="n">_iv</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">""</span><span class="p">)</span> <span class="k">if</span> <span class="n">iv</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">iv</span> |
| <span class="n">_aad</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">""</span><span class="p">)</span> <span class="k">if</span> <span class="n">aad</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">aad</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"aes_encrypt"</span><span class="p">,</span> <span class="nb">input</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">_mode</span><span class="p">,</span> <span class="n">_padding</span><span class="p">,</span> <span class="n">_iv</span><span class="p">,</span> <span class="n">_aad</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="aes_decrypt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.aes_decrypt.html#pyspark.sql.functions.aes_decrypt">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">aes_decrypt</span><span class="p">(</span> |
| <span class="nb">input</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">key</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">mode</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">padding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">aad</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a decrypted value of `input` using AES in `mode` with `padding`. Key lengths of 16,</span> |
| <span class="sd"> 24 and 32 bits are supported. Supported combinations of (`mode`, `padding`) are ('ECB',</span> |
| <span class="sd"> 'PKCS'), ('GCM', 'NONE') and ('CBC', 'PKCS'). Optional additional authenticated data (AAD) is</span> |
| <span class="sd"> only supported for GCM. If provided for encryption, the identical AAD value must be provided</span> |
| <span class="sd"> for decryption. The default mode is GCM.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> input : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The binary value to decrypt.</span> |
| <span class="sd"> key : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The passphrase to use to decrypt the data.</span> |
| <span class="sd"> mode : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> Specifies which block cipher mode should be used to decrypt messages. Valid modes: ECB,</span> |
| <span class="sd"> GCM, CBC.</span> |
| <span class="sd"> padding : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> Specifies how to pad messages whose length is not a multiple of the block size. Valid</span> |
| <span class="sd"> values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS</span> |
| <span class="sd"> for CBC.</span> |
| <span class="sd"> aad : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> Optional additional authenticated data. Only supported for GCM mode. This can be any</span> |
| <span class="sd"> free-form input and must be provided for both encryption and decryption.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(</span> |
| <span class="sd"> ... "AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4",</span> |
| <span class="sd"> ... "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT",</span> |
| <span class="sd"> ... "This is an AAD mixed into the input",)],</span> |
| <span class="sd"> ... ["input", "key", "mode", "padding", "aad"]</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(aes_decrypt(</span> |
| <span class="sd"> ... unbase64(df.input), df.key, df.mode, df.padding, df.aad).alias('r')</span> |
| <span class="sd"> ... ).collect()</span> |
| <span class="sd"> [Row(r=bytearray(b'Spark'))]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(</span> |
| <span class="sd"> ... "AAAAAAAAAAAAAAAAAAAAAPSd4mWyMZ5mhvjiAPQJnfg=",</span> |
| <span class="sd"> ... "abcdefghijklmnop12345678ABCDEFGH", "CBC", "DEFAULT",)],</span> |
| <span class="sd"> ... ["input", "key", "mode", "padding"]</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(aes_decrypt(</span> |
| <span class="sd"> ... unbase64(df.input), df.key, df.mode, df.padding).alias('r')</span> |
| <span class="sd"> ... ).collect()</span> |
| <span class="sd"> [Row(r=bytearray(b'Spark'))]</span> |
| |
| <span class="sd"> >>> df.select(aes_decrypt(unbase64(df.input), df.key, df.mode).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=bytearray(b'Spark'))]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(</span> |
| <span class="sd"> ... "83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94",</span> |
| <span class="sd"> ... "0000111122223333",)],</span> |
| <span class="sd"> ... ["input", "key"]</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(aes_decrypt(unhex(df.input), df.key).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=bytearray(b'Spark'))]</span> |
| <span class="sd"> """</span> |
| <span class="n">_mode</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">"GCM"</span><span class="p">)</span> <span class="k">if</span> <span class="n">mode</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">mode</span> |
| <span class="n">_padding</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">"DEFAULT"</span><span class="p">)</span> <span class="k">if</span> <span class="n">padding</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">padding</span> |
| <span class="n">_aad</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">""</span><span class="p">)</span> <span class="k">if</span> <span class="n">aad</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">aad</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"aes_decrypt"</span><span class="p">,</span> <span class="nb">input</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">_mode</span><span class="p">,</span> <span class="n">_padding</span><span class="p">,</span> <span class="n">_aad</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="try_aes_decrypt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.try_aes_decrypt.html#pyspark.sql.functions.try_aes_decrypt">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">try_aes_decrypt</span><span class="p">(</span> |
| <span class="nb">input</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">key</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> |
| <span class="n">mode</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">padding</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">aad</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> This is a special version of `aes_decrypt` that performs the same operation,</span> |
| <span class="sd"> but returns a NULL value instead of raising an error if the decryption cannot be performed.</span> |
| <span class="sd"> Returns a decrypted value of `input` using AES in `mode` with `padding`. Key lengths of 16,</span> |
| <span class="sd"> 24 and 32 bits are supported. Supported combinations of (`mode`, `padding`) are ('ECB',</span> |
| <span class="sd"> 'PKCS'), ('GCM', 'NONE') and ('CBC', 'PKCS'). Optional additional authenticated data (AAD) is</span> |
| <span class="sd"> only supported for GCM. If provided for encryption, the identical AAD value must be provided</span> |
| <span class="sd"> for decryption. The default mode is GCM.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> input : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The binary value to decrypt.</span> |
| <span class="sd"> key : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The passphrase to use to decrypt the data.</span> |
| <span class="sd"> mode : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> Specifies which block cipher mode should be used to decrypt messages. Valid modes: ECB,</span> |
| <span class="sd"> GCM, CBC.</span> |
| <span class="sd"> padding : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> Specifies how to pad messages whose length is not a multiple of the block size. Valid</span> |
| <span class="sd"> values: PKCS, NONE, DEFAULT. The DEFAULT padding means PKCS for ECB, NONE for GCM and PKCS</span> |
| <span class="sd"> for CBC.</span> |
| <span class="sd"> aad : :class:`~pyspark.sql.Column` or str, optional</span> |
| <span class="sd"> Optional additional authenticated data. Only supported for GCM mode. This can be any</span> |
| <span class="sd"> free-form input and must be provided for both encryption and decryption.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(</span> |
| <span class="sd"> ... "AAAAAAAAAAAAAAAAQiYi+sTLm7KD9UcZ2nlRdYDe/PX4",</span> |
| <span class="sd"> ... "abcdefghijklmnop12345678ABCDEFGH", "GCM", "DEFAULT",</span> |
| <span class="sd"> ... "This is an AAD mixed into the input",)],</span> |
| <span class="sd"> ... ["input", "key", "mode", "padding", "aad"]</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(try_aes_decrypt(</span> |
| <span class="sd"> ... unbase64(df.input), df.key, df.mode, df.padding, df.aad).alias('r')</span> |
| <span class="sd"> ... ).collect()</span> |
| <span class="sd"> [Row(r=bytearray(b'Spark'))]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(</span> |
| <span class="sd"> ... "AAAAAAAAAAAAAAAAAAAAAPSd4mWyMZ5mhvjiAPQJnfg=",</span> |
| <span class="sd"> ... "abcdefghijklmnop12345678ABCDEFGH", "CBC", "DEFAULT",)],</span> |
| <span class="sd"> ... ["input", "key", "mode", "padding"]</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(try_aes_decrypt(</span> |
| <span class="sd"> ... unbase64(df.input), df.key, df.mode, df.padding).alias('r')</span> |
| <span class="sd"> ... ).collect()</span> |
| <span class="sd"> [Row(r=bytearray(b'Spark'))]</span> |
| |
| <span class="sd"> >>> df.select(try_aes_decrypt(unbase64(df.input), df.key, df.mode).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=bytearray(b'Spark'))]</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(</span> |
| <span class="sd"> ... "83F16B2AA704794132802D248E6BFD4E380078182D1544813898AC97E709B28A94",</span> |
| <span class="sd"> ... "0000111122223333",)],</span> |
| <span class="sd"> ... ["input", "key"]</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(try_aes_decrypt(unhex(df.input), df.key).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=bytearray(b'Spark'))]</span> |
| <span class="sd"> """</span> |
| <span class="n">_mode</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">"GCM"</span><span class="p">)</span> <span class="k">if</span> <span class="n">mode</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">mode</span> |
| <span class="n">_padding</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">"DEFAULT"</span><span class="p">)</span> <span class="k">if</span> <span class="n">padding</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">padding</span> |
| <span class="n">_aad</span> <span class="o">=</span> <span class="n">lit</span><span class="p">(</span><span class="s2">""</span><span class="p">)</span> <span class="k">if</span> <span class="n">aad</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">aad</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"try_aes_decrypt"</span><span class="p">,</span> <span class="nb">input</span><span class="p">,</span> <span class="n">key</span><span class="p">,</span> <span class="n">_mode</span><span class="p">,</span> <span class="n">_padding</span><span class="p">,</span> <span class="n">_aad</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="sha"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.sha.html#pyspark.sql.functions.sha">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">sha</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a sha1 hash value as a hex string of the `col`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(1).select(sf.sha(sf.lit("Spark"))).show()</span> |
| <span class="sd"> +--------------------+</span> |
| <span class="sd"> | sha(Spark)|</span> |
| <span class="sd"> +--------------------+</span> |
| <span class="sd"> |85f5955f4b27a9a4c...|</span> |
| <span class="sd"> +--------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"sha"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="input_file_block_length"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.input_file_block_length.html#pyspark.sql.functions.input_file_block_length">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">input_file_block_length</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the length of the block being read, or -1 if not available.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.read.text("python/test_support/sql/ages_newlines.csv", lineSep=",")</span> |
| <span class="sd"> >>> df.select(input_file_block_length().alias('r')).first()</span> |
| <span class="sd"> Row(r=87)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"input_file_block_length"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="input_file_block_start"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.input_file_block_start.html#pyspark.sql.functions.input_file_block_start">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">input_file_block_start</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the start offset of the block being read, or -1 if not available.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.read.text("python/test_support/sql/ages_newlines.csv", lineSep=",")</span> |
| <span class="sd"> >>> df.select(input_file_block_start().alias('r')).first()</span> |
| <span class="sd"> Row(r=0)</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"input_file_block_start"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="reflect"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.reflect.html#pyspark.sql.functions.reflect">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">reflect</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Calls a method with reflection.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the first element should be a literal string for the class name,</span> |
| <span class="sd"> and the second element should be a literal string for the method name,</span> |
| <span class="sd"> and the remaining are input arguments to the Java method.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2",)], ["a"])</span> |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... reflect(lit("java.util.UUID"), lit("fromString"), df.a).alias('r')</span> |
| <span class="sd"> ... ).collect()</span> |
| <span class="sd"> [Row(r='a5cf6c42-0c85-418f-af6c-3e4e5b1328f2')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"reflect"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="java_method"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.java_method.html#pyspark.sql.functions.java_method">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">java_method</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Calls a method with reflection.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the first element should be a literal string for the class name,</span> |
| <span class="sd"> and the second element should be a literal string for the method name,</span> |
| <span class="sd"> and the remaining are input arguments to the Java method.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> spark.range(1).select(</span> |
| <span class="sd"> ... sf.java_method(</span> |
| <span class="sd"> ... sf.lit("java.util.UUID"),</span> |
| <span class="sd"> ... sf.lit("fromString"),</span> |
| <span class="sd"> ... sf.lit("a5cf6c42-0c85-418f-af6c-3e4e5b1328f2")</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> ... ).show(truncate=False)</span> |
| <span class="sd"> +-----------------------------------------------------------------------------+</span> |
| <span class="sd"> |java_method(java.util.UUID, fromString, a5cf6c42-0c85-418f-af6c-3e4e5b1328f2)|</span> |
| <span class="sd"> +-----------------------------------------------------------------------------+</span> |
| <span class="sd"> |a5cf6c42-0c85-418f-af6c-3e4e5b1328f2 |</span> |
| <span class="sd"> +-----------------------------------------------------------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"java_method"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="version"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.version.html#pyspark.sql.functions.version">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">version</span><span class="p">()</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the Spark version. The string contains 2 fields, the first being a release version</span> |
| <span class="sd"> and the second being a git revision.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.select(version()).show(truncate=False) # doctest: +SKIP</span> |
| <span class="sd"> +----------------------------------------------+</span> |
| <span class="sd"> |version() |</span> |
| <span class="sd"> +----------------------------------------------+</span> |
| <span class="sd"> |3.5.0 cafbea5b13623276517a9d716f75745eff91f616|</span> |
| <span class="sd"> +----------------------------------------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"version"</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="typeof"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.typeof.html#pyspark.sql.functions.typeof">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">typeof</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return DDL-formatted type string for the data type of the input.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1,)], ["a"])</span> |
| <span class="sd"> >>> df.select(typeof(df.a).alias('r')).collect()</span> |
| <span class="sd"> [Row(r='bigint')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"typeof"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="stack"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.stack.html#pyspark.sql.functions.stack">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">stack</span><span class="p">(</span><span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Separates `col1`, ..., `colk` into `n` rows. Uses column names col0, col1, etc. by default</span> |
| <span class="sd"> unless specified otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> the first element should be a literal int for the number of rows to be separated,</span> |
| <span class="sd"> and the remaining are input elements to be separated.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, 2, 3)], ["a", "b", "c"])</span> |
| <span class="sd"> >>> df.select(stack(lit(2), df.a, df.b, df.c)).show(truncate=False)</span> |
| <span class="sd"> +----+----+</span> |
| <span class="sd"> |col0|col1|</span> |
| <span class="sd"> +----+----+</span> |
| <span class="sd"> |1 |2 |</span> |
| <span class="sd"> |3 |NULL|</span> |
| <span class="sd"> +----+----+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_seq_of_columns</span><span class="p">(</span><span class="s2">"stack"</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bitmap_bit_position"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bitmap_bit_position.html#pyspark.sql.functions.bitmap_bit_position">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bitmap_bit_position</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the bit position for the given input column.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The input column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(123,)], ["a"])</span> |
| <span class="sd"> >>> df.select(bitmap_bit_position(df.a).alias("r")).collect()</span> |
| <span class="sd"> [Row(r=122)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bitmap_bit_position"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bitmap_bucket_number"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bitmap_bucket_number.html#pyspark.sql.functions.bitmap_bucket_number">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bitmap_bucket_number</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the bucket number for the given input column.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The input column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(123,)], ["a"])</span> |
| <span class="sd"> >>> df.select(bitmap_bucket_number(df.a).alias("r")).collect()</span> |
| <span class="sd"> [Row(r=1)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bitmap_bucket_number"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bitmap_construct_agg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bitmap_construct_agg.html#pyspark.sql.functions.bitmap_construct_agg">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bitmap_construct_agg</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a bitmap with the positions of the bits set from all the values from the input column.</span> |
| <span class="sd"> The input column will most likely be bitmap_bit_position().</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The input column will most likely be bitmap_bit_position().</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1,),(2,),(3,)], ["a"])</span> |
| <span class="sd"> >>> df.select(substring(hex(</span> |
| <span class="sd"> ... bitmap_construct_agg(bitmap_bit_position(df.a))</span> |
| <span class="sd"> ... ), 0, 6).alias("r")).collect()</span> |
| <span class="sd"> [Row(r='070000')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bitmap_construct_agg"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bitmap_count"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bitmap_count.html#pyspark.sql.functions.bitmap_count">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bitmap_count</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the number of set bits in the input bitmap.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The input bitmap.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("FFFF",)], ["a"])</span> |
| <span class="sd"> >>> df.select(bitmap_count(to_binary(df.a, lit("hex"))).alias('r')).collect()</span> |
| <span class="sd"> [Row(r=16)]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bitmap_count"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="bitmap_or_agg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.bitmap_or_agg.html#pyspark.sql.functions.bitmap_or_agg">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">bitmap_or_agg</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a bitmap that is the bitwise OR of all of the bitmaps from the input column.</span> |
| <span class="sd"> The input column should be bitmaps created from bitmap_construct_agg().</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`~pyspark.sql.Column` or str</span> |
| <span class="sd"> The input column should be bitmaps created from bitmap_construct_agg().</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("10",),("20",),("40",)], ["a"])</span> |
| <span class="sd"> >>> df.select(substring(hex(</span> |
| <span class="sd"> ... bitmap_or_agg(to_binary(df.a, lit("hex")))</span> |
| <span class="sd"> ... ), 0, 6).alias("r")).collect()</span> |
| <span class="sd"> [Row(r='700000')]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">_invoke_function_over_columns</span><span class="p">(</span><span class="s2">"bitmap_or_agg"</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span></div> |
| |
| |
| <span class="c1"># ---------------------------- User Defined Function ----------------------------------</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">udf</span><span class="p">(</span> |
| <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> |
| <span class="n">returnType</span><span class="p">:</span> <span class="s2">"DataTypeOrString"</span> <span class="o">=</span> <span class="n">StringType</span><span class="p">(),</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">useArrow</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"UserDefinedFunctionLike"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">udf</span><span class="p">(</span> |
| <span class="n">f</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"DataTypeOrString"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">useArrow</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Any</span><span class="p">]],</span> <span class="s2">"UserDefinedFunctionLike"</span><span class="p">]:</span> |
| <span class="o">...</span> |
| |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">udf</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">returnType</span><span class="p">:</span> <span class="s2">"DataTypeOrString"</span> <span class="o">=</span> <span class="n">StringType</span><span class="p">(),</span> |
| <span class="n">useArrow</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Any</span><span class="p">]],</span> <span class="s2">"UserDefinedFunctionLike"</span><span class="p">]:</span> |
| <span class="o">...</span> |
| |
| |
| <div class="viewcode-block" id="udf"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.udf.html#pyspark.sql.functions.udf">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">udf</span><span class="p">(</span> |
| <span class="n">f</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="s2">"DataTypeOrString"</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">returnType</span><span class="p">:</span> <span class="s2">"DataTypeOrString"</span> <span class="o">=</span> <span class="n">StringType</span><span class="p">(),</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">useArrow</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="s2">"UserDefinedFunctionLike"</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Any</span><span class="p">]],</span> <span class="s2">"UserDefinedFunctionLike"</span><span class="p">]]:</span> |
| <span class="w"> </span><span class="sd">"""Creates a user defined function (UDF).</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> f : function</span> |
| <span class="sd"> python function if used as a standalone function</span> |
| <span class="sd"> returnType : :class:`pyspark.sql.types.DataType` or str</span> |
| <span class="sd"> the return type of the user-defined function. The value can be either a</span> |
| <span class="sd"> :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.</span> |
| <span class="sd"> useArrow : bool or None</span> |
| <span class="sd"> whether to use Arrow to optimize the (de)serialization. When it is None, the</span> |
| <span class="sd"> Spark config "spark.sql.execution.pythonUDF.arrow.enabled" takes effect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.types import IntegerType</span> |
| <span class="sd"> >>> slen = udf(lambda s: len(s), IntegerType())</span> |
| <span class="sd"> >>> @udf</span> |
| <span class="sd"> ... def to_upper(s):</span> |
| <span class="sd"> ... if s is not None:</span> |
| <span class="sd"> ... return s.upper()</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> @udf(returnType=IntegerType())</span> |
| <span class="sd"> ... def add_one(x):</span> |
| <span class="sd"> ... if x is not None:</span> |
| <span class="sd"> ... return x + 1</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))</span> |
| <span class="sd"> >>> df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age")).show()</span> |
| <span class="sd"> +----------+--------------+------------+</span> |
| <span class="sd"> |slen(name)|to_upper(name)|add_one(age)|</span> |
| <span class="sd"> +----------+--------------+------------+</span> |
| <span class="sd"> | 8| JOHN DOE| 22|</span> |
| <span class="sd"> +----------+--------------+------------+</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The user-defined functions are considered deterministic by default. Due to</span> |
| <span class="sd"> optimization, duplicate invocations may be eliminated or the function may even be invoked</span> |
| <span class="sd"> more times than it is present in the query. If your function is not deterministic, call</span> |
| <span class="sd"> `asNondeterministic` on the user defined function. E.g.:</span> |
| |
| <span class="sd"> >>> from pyspark.sql.types import IntegerType</span> |
| <span class="sd"> >>> import random</span> |
| <span class="sd"> >>> random_udf = udf(lambda: int(random.random() * 100), IntegerType()).asNondeterministic()</span> |
| |
| <span class="sd"> The user-defined functions do not support conditional expressions or short circuiting</span> |
| <span class="sd"> in boolean expressions and it ends up with being executed all internally. If the functions</span> |
| <span class="sd"> can fail on special rows, the workaround is to incorporate the condition into the functions.</span> |
| |
| <span class="sd"> The user-defined functions do not take keyword arguments on the calling side.</span> |
| <span class="sd"> """</span> |
| |
| <span class="c1"># The following table shows most of Python data and SQL type conversions in normal UDFs that</span> |
| <span class="c1"># are not yet visible to the user. Some of behaviors are buggy and might be changed in the near</span> |
| <span class="c1"># future. The table might have to be eventually documented externally.</span> |
| <span class="c1"># Please see SPARK-28131's PR to see the codes in order to generate the table below.</span> |
| <span class="c1">#</span> |
| <span class="c1"># +-----------------------------+--------------+----------+------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+----------------------------+------------+--------------+------------------+----------------------+ # noqa</span> |
| <span class="c1"># |SQL Type \ Python Value(Type)|None(NoneType)|True(bool)|1(int)| a(str)| 1970-01-01(date)|1970-01-01 00:00:00(datetime)|1.0(float)|array('i', [1])(array)|[1](list)| (1,)(tuple)|bytearray(b'ABC')(bytearray)| 1(Decimal)|{'a': 1}(dict)|Row(kwargs=1)(Row)|Row(namedtuple=1)(Row)| # noqa</span> |
| <span class="c1"># +-----------------------------+--------------+----------+------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+----------------------------+------------+--------------+------------------+----------------------+ # noqa</span> |
| <span class="c1"># | boolean| None| True| None| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span> |
| <span class="c1"># | tinyint| None| None| 1| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span> |
| <span class="c1"># | smallint| None| None| 1| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span> |
| <span class="c1"># | int| None| None| 1| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span> |
| <span class="c1"># | bigint| None| None| 1| None| None| None| None| None| None| None| None| None| None| X| X| # noqa</span> |
| <span class="c1"># | string| None| 'true'| '1'| 'a'|'java.util.Gregor...| 'java.util.Gregor...| '1.0'| '[I@66cbb73a'| '[1]'|'[Ljava.lang.Obje...| '[B@5a51eb1a'| '1'| '{a=1}'| X| X| # noqa</span> |
| <span class="c1"># | date| None| X| X| X|datetime.date(197...| datetime.date(197...| X| X| X| X| X| X| X| X| X| # noqa</span> |
| <span class="c1"># | timestamp| None| X| X| X| X| datetime.datetime...| X| X| X| X| X| X| X| X| X| # noqa</span> |
| <span class="c1"># | float| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa</span> |
| <span class="c1"># | double| None| None| None| None| None| None| 1.0| None| None| None| None| None| None| X| X| # noqa</span> |
| <span class="c1"># | array<int>| None| None| None| None| None| None| None| [1]| [1]| [1]| [65, 66, 67]| None| None| X| X| # noqa</span> |
| <span class="c1"># | binary| None| None| None|bytearray(b'a')| None| None| None| None| None| None| bytearray(b'ABC')| None| None| X| X| # noqa</span> |
| <span class="c1"># | decimal(10,0)| None| None| None| None| None| None| None| None| None| None| None|Decimal('1')| None| X| X| # noqa</span> |
| <span class="c1"># | map<string,int>| None| None| None| None| None| None| None| None| None| None| None| None| {'a': 1}| X| X| # noqa</span> |
| <span class="c1"># | struct<_1:int>| None| X| X| X| X| X| X| X|Row(_1=1)| Row(_1=1)| X| X| Row(_1=None)| Row(_1=1)| Row(_1=1)| # noqa</span> |
| <span class="c1"># +-----------------------------+--------------+----------+------+---------------+--------------------+-----------------------------+----------+----------------------+---------+--------------------+----------------------------+------------+--------------+------------------+----------------------+ # noqa</span> |
| <span class="c1">#</span> |
| <span class="c1"># Note: DDL formatted string is used for 'SQL Type' for simplicity. This string can be</span> |
| <span class="c1"># used in `returnType`.</span> |
| <span class="c1"># Note: The values inside of the table are generated by `repr`.</span> |
| <span class="c1"># Note: 'X' means it throws an exception during the conversion.</span> |
| |
| <span class="c1"># decorator @udf, @udf(), @udf(dataType())</span> |
| <span class="k">if</span> <span class="n">f</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">DataType</span><span class="p">)):</span> |
| <span class="c1"># If DataType has been passed as a positional argument</span> |
| <span class="c1"># for decorator use it as a returnType</span> |
| <span class="n">return_type</span> <span class="o">=</span> <span class="n">f</span> <span class="ow">or</span> <span class="n">returnType</span> |
| <span class="k">return</span> <span class="n">functools</span><span class="o">.</span><span class="n">partial</span><span class="p">(</span> |
| <span class="n">_create_py_udf</span><span class="p">,</span> |
| <span class="n">returnType</span><span class="o">=</span><span class="n">return_type</span><span class="p">,</span> |
| <span class="n">useArrow</span><span class="o">=</span><span class="n">useArrow</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_create_py_udf</span><span class="p">(</span><span class="n">f</span><span class="o">=</span><span class="n">f</span><span class="p">,</span> <span class="n">returnType</span><span class="o">=</span><span class="n">returnType</span><span class="p">,</span> <span class="n">useArrow</span><span class="o">=</span><span class="n">useArrow</span><span class="p">)</span></div> |
| |
| |
| <div class="viewcode-block" id="udtf"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.functions.udtf.html#pyspark.sql.functions.udtf">[docs]</a><span class="nd">@try_remote_functions</span> |
| <span class="k">def</span> <span class="nf">udtf</span><span class="p">(</span> |
| <span class="bp">cls</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Type</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">returnType</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="n">useArrow</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="s2">"UserDefinedTableFunction"</span><span class="p">,</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Type</span><span class="p">],</span> <span class="s2">"UserDefinedTableFunction"</span><span class="p">]]:</span> |
| <span class="w"> </span><span class="sd">"""Creates a user defined table function (UDTF).</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cls : class</span> |
| <span class="sd"> the Python user-defined table function handler class.</span> |
| <span class="sd"> returnType : :class:`pyspark.sql.types.StructType` or str</span> |
| <span class="sd"> the return type of the user-defined table function. The value can be either a</span> |
| <span class="sd"> :class:`pyspark.sql.types.StructType` object or a DDL-formatted struct type string.</span> |
| <span class="sd"> useArrow : bool or None, optional</span> |
| <span class="sd"> whether to use Arrow to optimize the (de)serializations. When it's set to None, the</span> |
| <span class="sd"> Spark config "spark.sql.execution.pythonUDTF.arrow.enabled" is used.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Implement the UDTF class and create a UDTF:</span> |
| |
| <span class="sd"> >>> class TestUDTF:</span> |
| <span class="sd"> ... def eval(self, *args: Any):</span> |
| <span class="sd"> ... yield "hello", "world"</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> from pyspark.sql.functions import udtf</span> |
| <span class="sd"> >>> test_udtf = udtf(TestUDTF, returnType="c1: string, c2: string")</span> |
| <span class="sd"> >>> test_udtf().show()</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> | c1| c2|</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> |hello|world|</span> |
| <span class="sd"> +-----+-----+</span> |
| |
| <span class="sd"> UDTF can also be created using the decorator syntax:</span> |
| |
| <span class="sd"> >>> @udtf(returnType="c1: int, c2: int")</span> |
| <span class="sd"> ... class PlusOne:</span> |
| <span class="sd"> ... def eval(self, x: int):</span> |
| <span class="sd"> ... yield x, x + 1</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> from pyspark.sql.functions import lit</span> |
| <span class="sd"> >>> PlusOne(lit(1)).show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | c1| c2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 2|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> Arrow optimization can be explicitly enabled when creating UDTFs:</span> |
| |
| <span class="sd"> >>> @udtf(returnType="c1: int, c2: int", useArrow=True)</span> |
| <span class="sd"> ... class ArrowPlusOne:</span> |
| <span class="sd"> ... def eval(self, x: int):</span> |
| <span class="sd"> ... yield x, x + 1</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> ArrowPlusOne(lit(1)).show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | c1| c2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 2|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> User-defined table functions (UDTFs) are considered non-deterministic by default.</span> |
| <span class="sd"> Use `asDeterministic()` to mark a function as deterministic. E.g.:</span> |
| |
| <span class="sd"> >>> class PlusOne:</span> |
| <span class="sd"> ... def eval(self, a: int):</span> |
| <span class="sd"> ... yield a + 1,</span> |
| <span class="sd"> >>> plus_one = udtf(PlusOne, returnType="r: int").asDeterministic()</span> |
| |
| <span class="sd"> Use "yield" to produce one row for the UDTF result relation as many times</span> |
| <span class="sd"> as needed. In the context of a lateral join, each such result row will be</span> |
| <span class="sd"> associated with the most recent input row consumed from the "eval" method.</span> |
| |
| <span class="sd"> User-defined table functions are considered opaque to the optimizer by default.</span> |
| <span class="sd"> As a result, operations like filters from WHERE clauses or limits from</span> |
| <span class="sd"> LIMIT/OFFSET clauses that appear after the UDTF call will execute on the</span> |
| <span class="sd"> UDTF's result relation. By the same token, any relations forwarded as input</span> |
| <span class="sd"> to UDTFs will plan as full table scans in the absence of any explicit such</span> |
| <span class="sd"> filtering or other logic explicitly written in a table subquery surrounding the</span> |
| <span class="sd"> provided input relation.</span> |
| |
| <span class="sd"> User-defined table functions do not accept keyword arguments on the calling side.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">cls</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">functools</span><span class="o">.</span><span class="n">partial</span><span class="p">(</span><span class="n">_create_py_udtf</span><span class="p">,</span> <span class="n">returnType</span><span class="o">=</span><span class="n">returnType</span><span class="p">,</span> <span class="n">useArrow</span><span class="o">=</span><span class="n">useArrow</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">_create_py_udtf</span><span class="p">(</span><span class="bp">cls</span><span class="o">=</span><span class="bp">cls</span><span class="p">,</span> <span class="n">returnType</span><span class="o">=</span><span class="n">returnType</span><span class="p">,</span> <span class="n">useArrow</span><span class="o">=</span><span class="n">useArrow</span><span class="p">)</span></div> |
| |
| |
| <span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">doctest</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="kn">import</span> <span class="nn">pyspark.sql.functions</span> |
| |
| <span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">functions</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">"local[4]"</span><span class="p">)</span><span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">"sql.functions tests"</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">sparkContext</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"sc"</span><span class="p">]</span> <span class="o">=</span> <span class="n">sc</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"spark"</span><span class="p">]</span> <span class="o">=</span> <span class="n">spark</span> |
| <span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span> |
| <span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">functions</span><span class="p">,</span> |
| <span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> |
| <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span> |
| <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="n">_test</span><span class="p">()</span> |
| </pre></div> |
| |
| </div> |
| |
| |
| <!-- Previous / next buttons --> |
| <div class='prev-next-area'> |
| </div> |
| |
| </main> |
| |
| |
| </div> |
| </div> |
| |
| <script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=1999514e3f237ded88cf"></script> |
| <footer class="footer mt-5 mt-md-0"> |
| <div class="container"> |
| |
| <div class="footer-item"> |
| <p class="copyright"> |
| © Copyright .<br> |
| </p> |
| </div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.0.4.<br> |
| </p> |
| </div> |
| |
| </div> |
| </footer> |
| </body> |
| </html> |