blob: a29f23ad6588d6e8cd224ed3af8d0b7fec455a8a [file] [log] [blame]
<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <meta name="docsearch:language" content="en">
  <title>pyspark.pandas.base — PySpark 3.3.0 documentation</title>

  <!-- Stylesheets and font preloads. Relative order is kept as in the original so the cascade is unchanged. -->
  <link rel="stylesheet" href="../../../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css">
  <link rel="stylesheet" href="../../../_static/vendor/fontawesome/5.13.0/css/all.min.css">
  <!-- crossorigin is required on font preloads, otherwise the preloaded response is not reused. -->
  <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
  <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
  <link rel="stylesheet" href="../../../_static/vendor/open-sans_all/1.44.1/index.css">
  <link rel="stylesheet" href="../../../_static/vendor/lato_latin-ext/1.44.1/index.css">
  <link rel="stylesheet" href="../../../_static/basic.css">
  <link rel="stylesheet" href="../../../_static/pygments.css">
  <link rel="stylesheet" href="../../../_static/css/pyspark.css">
  <link rel="preload" as="script" href="../../../_static/js/index.3da636dd464baa7582d2.js">

  <!-- Blocking scripts; execution order is the same as in the original document. -->
  <script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script>
  <script src="../../../_static/jquery.js"></script>
  <script src="../../../_static/underscore.js"></script>
  <script src="../../../_static/doctools.js"></script>
  <script src="../../../_static/language_data.js"></script>
  <script src="../../../_static/copybutton.js"></script>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" crossorigin="anonymous"></script>

  <!--
    MathJax v2 collects text/x-mathjax-config blocks at the moment its loader starts.
    The loader below is async and may run as soon as it is fetched, so the
    configuration has to appear in the document before the loader tag.
  -->
  <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script>
  <script async src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>

  <link rel="search" title="Search" href="../../../search.html">
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<!-- Fixed top bar: logo link, collapse toggle for narrow viewports, and the top-level section links. -->
<nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main" aria-label="Site navigation">
  <div class="container-xl">
    <!-- The image is the only content of this link, so its alt text names the link destination. -->
    <a class="navbar-brand" href="../../../index.html">
      <img class="logo" src="../../../_static/spark-logo-reverse.png" alt="PySpark documentation home">
    </a>
    <!-- Toggles the #navbar-menu collapse region; aria-expanded starts as "false". -->
    <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation">
      <span class="navbar-toggler-icon"></span>
    </button>
    <div id="navbar-menu" class="col-lg-9 collapse navbar-collapse">
      <ul id="navbar-main-elements" class="navbar-nav mr-auto">
        <li class="nav-item">
          <a class="nav-link" href="../../../getting_started/index.html">Getting Started</a>
        </li>
        <li class="nav-item">
          <a class="nav-link" href="../../../user_guide/index.html">User Guide</a>
        </li>
        <li class="nav-item">
          <a class="nav-link" href="../../../reference/index.html">API Reference</a>
        </li>
        <li class="nav-item">
          <a class="nav-link" href="../../../development/index.html">Development</a>
        </li>
        <li class="nav-item">
          <a class="nav-link" href="../../../migration_guide/index.html">Migration Guide</a>
        </li>
      </ul>
      <!-- Second list is kept as emitted; it has no items on this page. -->
      <ul class="navbar-nav">
      </ul>
    </div>
  </div>
</nav>
<div class="container-xl">
<div class="row">
<!-- Left column: search form followed by the sidebar navigation. -->
<div class="col-12 col-md-3 bd-sidebar">
  <form class="bd-search d-flex align-items-center" action="../../../search.html" method="get">
    <!-- Decorative icon; the input carries the accessible name via aria-label. -->
    <i class="icon fas fa-search" aria-hidden="true"></i>
    <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off">
  </form>
  <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
    <div class="bd-toc-item active">
      <ul class="nav bd-sidenav">
      </ul>
    </div><!-- closes .bd-toc-item, which was previously left open before the nav end tag -->
  </nav>
</div>
<!-- Right column (class d-none d-xl-block): in-page table of contents. The list has no entries on this page. -->
<div class="d-none d-xl-block col-xl-2 bd-toc">
  <!-- id is referenced by data-target="#bd-toc-nav" on the body element; the label tells this landmark apart from the other nav elements. -->
  <nav id="bd-toc-nav" aria-label="On this page">
    <ul class="nav section-nav flex-column">
    </ul>
  </nav>
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<h1>Source code for pyspark.pandas.base</h1><div class="highlight"><pre>
<span></span><span class="c1">#</span>
<span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span>
<span class="c1"># contributor license agreements. See the NOTICE file distributed with</span>
<span class="c1"># this work for additional information regarding copyright ownership.</span>
<span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span>
<span class="c1"># (the &quot;License&quot;); you may not use this file except in compliance with</span>
<span class="c1"># the License. You may obtain a copy of the License at</span>
<span class="c1">#</span>
<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
<span class="c1">#</span>
<span class="c1"># Unless required by applicable law or agreed to in writing, software</span>
<span class="c1"># distributed under the License is distributed on an &quot;AS IS&quot; BASIS,</span>
<span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
<span class="c1"># See the License for the specific language governing permissions and</span>
<span class="c1"># limitations under the License.</span>
<span class="c1">#</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd">Base and utility classes for pandas-on-Spark objects.</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">abc</span> <span class="kn">import</span> <span class="n">ABCMeta</span><span class="p">,</span> <span class="n">abstractmethod</span>
<span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">wraps</span><span class="p">,</span> <span class="n">partial</span>
<span class="kn">from</span> <span class="nn">itertools</span> <span class="kn">import</span> <span class="n">chain</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Any</span><span class="p">,</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">,</span> <span class="n">Union</span><span class="p">,</span> <span class="n">cast</span><span class="p">,</span> <span class="n">TYPE_CHECKING</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="kn">from</span> <span class="nn">pandas.api.types</span> <span class="kn">import</span> <span class="n">is_list_like</span><span class="p">,</span> <span class="n">CategoricalDtype</span> <span class="c1"># type: ignore[attr-defined]</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span><span class="p">,</span> <span class="n">Column</span><span class="p">,</span> <span class="n">Window</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">LongType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">,</span> <span class="n">NumericType</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">pandas</span> <span class="k">as</span> <span class="n">ps</span> <span class="c1"># For running doctests and reference resolution in PyCharm.</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas._typing</span> <span class="kn">import</span> <span class="n">Axis</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">,</span> <span class="n">IndexOpsLike</span><span class="p">,</span> <span class="n">Label</span><span class="p">,</span> <span class="n">SeriesOrIndex</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.config</span> <span class="kn">import</span> <span class="n">get_option</span><span class="p">,</span> <span class="n">option_context</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.internal</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">InternalField</span><span class="p">,</span>
<span class="n">InternalFrame</span><span class="p">,</span>
<span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span>
<span class="n">SPARK_DEFAULT_INDEX_NAME</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.spark</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">SF</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.spark.accessors</span> <span class="kn">import</span> <span class="n">SparkIndexOpsMethods</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.typedef</span> <span class="kn">import</span> <span class="n">extension_dtypes</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.utils</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">combine_frames</span><span class="p">,</span>
<span class="n">same_anchor</span><span class="p">,</span>
<span class="n">scol_for</span><span class="p">,</span>
<span class="n">validate_axis</span><span class="p">,</span>
<span class="n">ERROR_MESSAGE_CANNOT_COMBINE</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.frame</span> <span class="kn">import</span> <span class="n">DataFrame</span>
<span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.sql._typing</span> <span class="kn">import</span> <span class="n">ColumnOrName</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.data_type_ops.base</span> <span class="kn">import</span> <span class="n">DataTypeOps</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span>
<span class="k">def</span> <span class="nf">should_alignment_for_column_op</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">SeriesOrIndex</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">SeriesOrIndex</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">return</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> <span class="ow">is</span> <span class="ow">not</span> <span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="k">def</span> <span class="nf">align_diff_index_ops</span><span class="p">(</span>
<span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">this_index_ops</span><span class="p">:</span> <span class="n">SeriesOrIndex</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Align the `IndexOpsMixin` objects and apply the function.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> func : The function to apply</span>
<span class="sd"> this_index_ops : IndexOpsMixin</span>
<span class="sd"> A base `IndexOpsMixin` object</span>
<span class="sd"> args : list of other arguments including other `IndexOpsMixin` objects</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> `Index` if all `this_index_ops` and arguments are `Index`; otherwise `Series`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes</span> <span class="kn">import</span> <span class="n">Index</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span><span class="p">,</span> <span class="n">first_series</span>
<span class="n">cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">arg</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">IndexOpsMixin</span><span class="p">)]</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">this_index_ops</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">):</span>
<span class="n">combined</span> <span class="o">=</span> <span class="n">combine_frames</span><span class="p">(</span>
<span class="n">this_index_ops</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(),</span>
<span class="o">*</span><span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Series</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">cols</span><span class="p">)],</span>
<span class="n">how</span><span class="o">=</span><span class="s2">&quot;full&quot;</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">column_op</span><span class="p">(</span><span class="n">func</span><span class="p">)(</span>
<span class="n">combined</span><span class="p">[</span><span class="s2">&quot;this&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">combined</span><span class="p">[</span><span class="s2">&quot;this&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">[</span><span class="mi">0</span><span class="p">]),</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">combined</span><span class="p">[</span><span class="s2">&quot;that&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">combined</span><span class="p">[</span><span class="s2">&quot;that&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="p">],</span>
<span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">this_index_ops</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># This could cause as many counts, reset_index calls, joins for combining</span>
<span class="c1"># as the number of `Index`s in `args`. So far it&#39;s fine since we can assume the ops</span>
<span class="c1"># only work between at most two `Index`s. We might need to fix it in the future.</span>
<span class="n">self_len</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">this_index_ops</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="o">!=</span> <span class="n">self_len</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">args</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">IndexOpsMixin</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;operands could not be broadcast together with shapes&quot;</span><span class="p">)</span>
<span class="k">with</span> <span class="n">option_context</span><span class="p">(</span><span class="s2">&quot;compute.default_index_type&quot;</span><span class="p">,</span> <span class="s2">&quot;distributed-sequence&quot;</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">this_index_ops</span><span class="p">,</span> <span class="n">Index</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Index</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">):</span>
<span class="k">return</span> <span class="n">Index</span><span class="p">(</span>
<span class="n">column_op</span><span class="p">(</span><span class="n">func</span><span class="p">)(</span>
<span class="n">this_index_ops</span><span class="o">.</span><span class="n">to_series</span><span class="p">()</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">),</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">arg</span><span class="o">.</span><span class="n">to_series</span><span class="p">()</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Index</span><span class="p">)</span>
<span class="k">else</span> <span class="n">arg</span>
<span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span>
<span class="p">],</span>
<span class="p">)</span><span class="o">.</span><span class="n">sort_index</span><span class="p">(),</span>
<span class="n">name</span><span class="o">=</span><span class="n">this_index_ops</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">this_index_ops</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="n">this</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">this_index_ops</span><span class="o">.</span><span class="n">reset_index</span><span class="p">())</span>
<span class="n">that</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">cast</span><span class="p">(</span><span class="n">Series</span><span class="p">,</span> <span class="n">col</span><span class="o">.</span><span class="n">to_series</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Index</span><span class="p">)</span> <span class="k">else</span> <span class="n">col</span><span class="p">)</span>
<span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">i</span><span class="p">)</span>
<span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span>
<span class="p">]</span>
<span class="n">combined</span> <span class="o">=</span> <span class="n">combine_frames</span><span class="p">(</span><span class="n">this</span><span class="p">,</span> <span class="o">*</span><span class="n">that</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s2">&quot;full&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">sort_index</span><span class="p">()</span>
<span class="n">combined</span> <span class="o">=</span> <span class="n">combined</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span>
<span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">[:</span> <span class="n">this_index_ops</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">]</span>
<span class="p">)</span>
<span class="n">combined</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span> <span class="o">=</span> <span class="n">this_index_ops</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span>
<span class="k">return</span> <span class="n">column_op</span><span class="p">(</span><span class="n">func</span><span class="p">)(</span>
<span class="n">first_series</span><span class="p">(</span><span class="n">combined</span><span class="p">[</span><span class="s2">&quot;this&quot;</span><span class="p">]),</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">combined</span><span class="p">[</span><span class="s2">&quot;that&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">combined</span><span class="p">[</span><span class="s2">&quot;that&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="p">],</span>
<span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">this_index_ops</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">this</span> <span class="o">=</span> <span class="n">this_index_ops</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">that_series</span> <span class="o">=</span> <span class="nb">next</span><span class="p">(</span><span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Series</span><span class="p">))</span>
<span class="n">that_frame</span> <span class="o">=</span> <span class="n">that_series</span><span class="o">.</span><span class="n">_psdf</span><span class="p">[</span>
<span class="p">[</span>
<span class="n">cast</span><span class="p">(</span><span class="n">Series</span><span class="p">,</span> <span class="n">col</span><span class="o">.</span><span class="n">to_series</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Index</span><span class="p">)</span> <span class="k">else</span> <span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">i</span><span class="p">)</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span>
<span class="p">]</span>
<span class="p">]</span>
<span class="n">combined</span> <span class="o">=</span> <span class="n">combine_frames</span><span class="p">(</span><span class="n">this</span><span class="p">,</span> <span class="n">that_frame</span><span class="o">.</span><span class="n">reset_index</span><span class="p">())</span><span class="o">.</span><span class="n">sort_index</span><span class="p">()</span>
<span class="n">self_index</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">combined</span><span class="p">[</span><span class="s2">&quot;this&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="n">combined</span><span class="p">[</span><span class="s2">&quot;this&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span><span class="o">.</span><span class="n">index</span>
<span class="p">)</span>
<span class="n">other</span> <span class="o">=</span> <span class="n">combined</span><span class="p">[</span><span class="s2">&quot;that&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span>
<span class="n">combined</span><span class="p">[</span><span class="s2">&quot;that&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">[:</span> <span class="n">that_series</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">]</span>
<span class="p">)</span>
<span class="n">other</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span> <span class="o">=</span> <span class="n">that_series</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span>
<span class="k">return</span> <span class="n">column_op</span><span class="p">(</span><span class="n">func</span><span class="p">)(</span>
<span class="n">self_index</span><span class="p">,</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">other</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span>
<span class="p">],</span>
<span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">that_series</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">booleanize_null</span><span class="p">(</span><span class="n">scol</span><span class="p">:</span> <span class="n">Column</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Booleanize Null in Spark Column</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">comp_ops</span> <span class="o">=</span> <span class="p">[</span>
<span class="nb">getattr</span><span class="p">(</span><span class="n">Column</span><span class="p">,</span> <span class="s2">&quot;__</span><span class="si">{}</span><span class="s2">__&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">comp_op</span><span class="p">))</span>
<span class="k">for</span> <span class="n">comp_op</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;eq&quot;</span><span class="p">,</span> <span class="s2">&quot;ne&quot;</span><span class="p">,</span> <span class="s2">&quot;lt&quot;</span><span class="p">,</span> <span class="s2">&quot;le&quot;</span><span class="p">,</span> <span class="s2">&quot;ge&quot;</span><span class="p">,</span> <span class="s2">&quot;gt&quot;</span><span class="p">]</span>
<span class="p">]</span>
<span class="k">if</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">comp_ops</span><span class="p">:</span>
<span class="c1"># if `f` is &quot;!=&quot;, fill null with True otherwise False</span>
<span class="n">filler</span> <span class="o">=</span> <span class="n">f</span> <span class="o">==</span> <span class="n">Column</span><span class="o">.</span><span class="fm">__ne__</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="n">filler</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span>
<span class="k">return</span> <span class="n">scol</span>
<span class="k">def</span> <span class="nf">column_op</span><span class="p">(</span><span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">SeriesOrIndex</span><span class="p">]:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A decorator that wraps APIs taking/returning Spark Column so that pandas-on-Spark Series can be</span>
<span class="sd"> supported too. If this decorator is used for the `f` function that takes Spark Column and</span>
<span class="sd"> returns Spark Column, decorated `f` takes pandas-on-Spark Series as well and returns</span>
<span class="sd"> pandas-on-Spark Series.</span>
<span class="sd"> :param f: a function that takes Spark Column and returns Spark Column.</span>
<span class="sd"> :param self: pandas-on-Spark Series</span>
<span class="sd"> :param args: arguments that the function `f` takes.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@wraps</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">wrapper</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">SeriesOrIndex</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes.base</span> <span class="kn">import</span> <span class="n">Index</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span>
<span class="c1"># It is possible that the function `f` takes arguments other than Spark Column.</span>
<span class="c1"># To cover this case, explicitly check if the argument is a pandas-on-Spark Series and</span>
<span class="c1"># extract its Spark Column. Other arguments are used as they are.</span>
<span class="n">cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">arg</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="p">(</span><span class="n">Series</span><span class="p">,</span> <span class="n">Index</span><span class="p">))]</span>
<span class="k">if</span> <span class="nb">all</span><span class="p">(</span><span class="ow">not</span> <span class="n">should_alignment_for_column_op</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">):</span>
<span class="c1"># Same DataFrame anchors</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">f</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span>
<span class="o">*</span><span class="p">[</span><span class="n">arg</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">IndexOpsMixin</span><span class="p">)</span> <span class="k">else</span> <span class="n">arg</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">],</span>
<span class="p">)</span>
<span class="n">field</span> <span class="o">=</span> <span class="n">InternalField</span><span class="o">.</span><span class="n">from_struct_field</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span><span class="o">.</span><span class="n">schema</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span>
<span class="n">use_extension_dtypes</span><span class="o">=</span><span class="nb">any</span><span class="p">(</span>
<span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">extension_dtypes</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="p">[</span><span class="bp">self</span><span class="p">]</span> <span class="o">+</span> <span class="n">cols</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">field</span><span class="o">.</span><span class="n">is_extension_dtype</span><span class="p">:</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">booleanize_null</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">f</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">):</span>
<span class="n">index_ops</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">field</span><span class="o">=</span><span class="n">field</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">psser</span> <span class="o">=</span> <span class="nb">next</span><span class="p">(</span><span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Series</span><span class="p">))</span>
<span class="n">index_ops</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">field</span><span class="o">=</span><span class="n">field</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">&quot;compute.ops_on_diff_frames&quot;</span><span class="p">):</span>
<span class="n">index_ops</span> <span class="o">=</span> <span class="n">align_diff_index_ops</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">ERROR_MESSAGE_CANNOT_COMBINE</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="n">col</span><span class="o">.</span><span class="n">name</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">):</span>
<span class="n">index_ops</span> <span class="o">=</span> <span class="n">index_ops</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span>
<span class="k">return</span> <span class="n">index_ops</span>
<span class="k">return</span> <span class="n">wrapper</span>
<span class="k">def</span> <span class="nf">numpy_column_op</span><span class="p">(</span><span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">SeriesOrIndex</span><span class="p">]:</span>
<span class="nd">@wraps</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">wrapper</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">SeriesOrIndex</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="c1"># PySpark does not support NumPy types out of the box. For now, we convert NumPy types</span>
<span class="c1"># into primitive types that PySpark understands.</span>
<span class="n">new_args</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">:</span>
<span class="c1"># TODO: This is a quick hack to support NumPy types. We should revisit this.</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">LongType</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">timedelta64</span><span class="p">):</span>
<span class="n">new_args</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">arg</span> <span class="o">/</span> <span class="n">np</span><span class="o">.</span><span class="n">timedelta64</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s2">&quot;s&quot;</span><span class="p">)))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">new_args</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span>
<span class="k">return</span> <span class="n">column_op</span><span class="p">(</span><span class="n">f</span><span class="p">)(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">new_args</span><span class="p">)</span>
<span class="k">return</span> <span class="n">wrapper</span>
<span class="k">class</span> <span class="nc">IndexOpsMixin</span><span class="p">(</span><span class="nb">object</span><span class="p">,</span> <span class="n">metaclass</span><span class="o">=</span><span class="n">ABCMeta</span><span class="p">):</span>
<span class="sd">&quot;&quot;&quot;common ops mixin to support a unified interface / docs for Series / Index</span>
<span class="sd"> Assumes that the following attributes, properties, and functions are available.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@property</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_internal</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">InternalFrame</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@property</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_psdf</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_with_new_scol</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">,</span> <span class="n">scol</span><span class="p">:</span> <span class="n">Column</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">field</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">InternalField</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@property</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_column_label</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Label</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="nd">@property</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">spark</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SparkIndexOpsMethods</span><span class="p">[</span><span class="n">IndexOpsLike</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">_dtype_op</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataTypeOps&quot;</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.data_type_ops.base</span> <span class="kn">import</span> <span class="n">DataTypeOps</span>
<span class="k">return</span> <span class="n">DataTypeOps</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">)</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">copy</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="k">pass</span>
<span class="c1"># arithmetic operators</span>
<span class="k">def</span> <span class="fm">__neg__</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">neg</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__add__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__sub__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__mul__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">mul</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__truediv__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __truediv__ behaves differently in pandas and PySpark in several cases.</span>
<span class="sd"> 1. When dividing np.inf by zero, PySpark returns null whereas pandas returns np.inf</span>
<span class="sd"> 2. When dividing a positive number by zero, PySpark returns null whereas pandas returns np.inf</span>
<span class="sd"> 3. When dividing -np.inf by zero, PySpark returns null whereas pandas returns -np.inf</span>
<span class="sd"> 4. When dividing a negative number by zero, PySpark returns null whereas pandas returns -np.inf</span>
<span class="sd"> +-------------------------------------------+</span>
<span class="sd"> | dividend (divisor: 0) | PySpark | pandas |</span>
<span class="sd"> |-----------------------|---------|---------|</span>
<span class="sd"> | np.inf | null | np.inf |</span>
<span class="sd"> | -np.inf | null | -np.inf |</span>
<span class="sd"> | 10 | null | np.inf |</span>
<span class="sd"> | -10 | null | -np.inf |</span>
<span class="sd"> +-----------------------|---------|---------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">truediv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__mod__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">mod</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__radd__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">radd</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rsub__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rsub</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rmul__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rmul</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rtruediv__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rtruediv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__floordiv__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __floordiv__ behaves differently in pandas and PySpark in several cases.</span>
<span class="sd"> 1. When dividing np.inf by zero, PySpark returns null whereas pandas returns np.inf</span>
<span class="sd"> 2. When dividing a positive number by zero, PySpark returns null whereas pandas returns np.inf</span>
<span class="sd"> 3. When dividing -np.inf by zero, PySpark returns null whereas pandas returns -np.inf</span>
<span class="sd"> 4. When dividing a negative number by zero, PySpark returns null whereas pandas returns -np.inf</span>
<span class="sd"> +-------------------------------------------+</span>
<span class="sd"> | dividend (divisor: 0) | PySpark | pandas |</span>
<span class="sd"> |-----------------------|---------|---------|</span>
<span class="sd"> | np.inf | null | np.inf |</span>
<span class="sd"> | -np.inf | null | -np.inf |</span>
<span class="sd"> | 10 | null | np.inf |</span>
<span class="sd"> | -10 | null | -np.inf |</span>
<span class="sd"> +-----------------------|---------|---------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">floordiv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rfloordiv__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rfloordiv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rmod__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rmod</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__pow__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">pow</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rpow__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rpow</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__abs__</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="c1"># comparison operators</span>
<span class="k">def</span> <span class="fm">__eq__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span> <span class="c1"># type: ignore[override]</span>
<span class="c1"># pandas always returns False for every item when comparing with a dict or a set.</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="p">(</span><span class="nb">dict</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span>
<span class="k">return</span> <span class="bp">self</span> <span class="o">!=</span> <span class="bp">self</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">eq</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__ne__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span> <span class="c1"># type: ignore[override]</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">ne</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__lt__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">lt</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__le__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">le</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__ge__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">ge</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__gt__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">gt</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__invert__</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">invert</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="c1"># `and`, `or`, `not` cannot be overloaded in Python,</span>
<span class="c1"># so use bitwise operators as boolean operators</span>
<span class="k">def</span> <span class="fm">__and__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="fm">__and__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__or__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="fm">__or__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rand__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__ror__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">ror</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__xor__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">xor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rxor__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rxor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__len__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">)</span>
<span class="c1"># NDArray Compat</span>
<span class="k">def</span> <span class="nf">__array_ufunc__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">ufunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">inputs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas</span> <span class="kn">import</span> <span class="n">numpy_compat</span>
<span class="c1"># Try dunder methods first.</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">numpy_compat</span><span class="o">.</span><span class="n">maybe_dispatch_ufunc_to_dunder_op</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">ufunc</span><span class="p">,</span> <span class="n">method</span><span class="p">,</span> <span class="o">*</span><span class="n">inputs</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span>
<span class="p">)</span>
<span class="c1"># After that, we try with PySpark APIs.</span>
<span class="k">if</span> <span class="n">result</span> <span class="ow">is</span> <span class="bp">NotImplemented</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">numpy_compat</span><span class="o">.</span><span class="n">maybe_dispatch_ufunc_to_spark_func</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">ufunc</span><span class="p">,</span> <span class="n">method</span><span class="p">,</span> <span class="o">*</span><span class="n">inputs</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">result</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">NotImplemented</span><span class="p">:</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">SeriesOrIndex</span><span class="p">,</span> <span class="n">result</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># TODO: support more APIs?</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span>
<span class="s2">&quot;pandas-on-Spark objects currently do not support </span><span class="si">%s</span><span class="s2">.&quot;</span> <span class="o">%</span> <span class="n">ufunc</span>
<span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">dtype</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Dtype</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;Return the dtype object of the underlying data.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([1, 2, 3])</span>
<span class="sd"> &gt;&gt;&gt; s.dtype</span>
<span class="sd"> dtype(&#39;int64&#39;)</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series(list(&#39;abc&#39;))</span>
<span class="sd"> &gt;&gt;&gt; s.dtype</span>
<span class="sd"> dtype(&#39;O&#39;)</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series(pd.date_range(&#39;20130101&#39;, periods=3))</span>
<span class="sd"> &gt;&gt;&gt; s.dtype</span>
<span class="sd"> dtype(&#39;&lt;M8[ns]&#39;)</span>
<span class="sd"> &gt;&gt;&gt; s.rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.dtype</span>
<span class="sd"> dtype(&#39;&lt;M8[ns]&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">dtype</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">empty</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns true if the current object is empty. Otherwise, returns false.</span>
<span class="sd"> &gt;&gt;&gt; ps.range(10).id.empty</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.range(0).id.empty</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({}, index=list(&#39;abc&#39;)).index.empty</span>
<span class="sd"> False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">rdd</span><span class="o">.</span><span class="n">isEmpty</span><span class="p">()</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">hasnans</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return True if it has any missing values. Otherwise, it returns False.</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({}, index=list(&#39;abc&#39;)).index.hasnans</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([&#39;a&#39;, None]).hasnans</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1.0, 2.0, np.nan]).hasnans</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3]).hasnans</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; (ps.Series([1.0, 2.0, np.nan]) + 1).hasnans</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3]).rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.hasnans</span>
<span class="sd"> False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">isnull</span><span class="p">()</span><span class="o">.</span><span class="n">any</span><span class="p">()</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">is_monotonic</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return a boolean indicating whether the values in the object are monotonically increasing.</span>
<span class="sd"> .. note:: the current implementation of is_monotonic needs to shuffle</span>
<span class="sd"> and aggregate multiple times to check the order locally and globally,</span>
<span class="sd"> which is potentially expensive. In case of multi-index, all data are</span>
<span class="sd"> transferred to a single node, which currently can easily cause an out-of-memory error.</span>
<span class="sd"> .. note:: Disable the Spark config `spark.sql.optimizer.nestedSchemaPruning.enabled`</span>
<span class="sd"> for multi-index if you&#39;re using pandas-on-Spark &lt; 1.7.0 with PySpark 3.1.1.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> is_monotonic : bool</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([&#39;1/1/2018&#39;, &#39;3/1/2018&#39;, &#39;4/1/2018&#39;])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;dates&#39;: [None, &#39;1/1/2018&#39;, &#39;2/1/2018&#39;, &#39;3/1/2018&#39;]})</span>
<span class="sd"> &gt;&gt;&gt; df.dates.is_monotonic</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; df.index.is_monotonic</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([1])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser.rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.is_monotonic</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([5, 4, 3, 2, 1], index=[1, 2, 3, 4, 5])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ser.index.is_monotonic</span>
<span class="sd"> True</span>
<span class="sd"> Support for MultiIndex</span>
<span class="sd"> &gt;&gt;&gt; midx = ps.MultiIndex.from_tuples(</span>
<span class="sd"> ... [(&#39;x&#39;, &#39;a&#39;), (&#39;x&#39;, &#39;b&#39;), (&#39;y&#39;, &#39;c&#39;), (&#39;y&#39;, &#39;d&#39;), (&#39;z&#39;, &#39;e&#39;)])</span>
<span class="sd"> &gt;&gt;&gt; midx # doctest: +SKIP</span>
<span class="sd"> MultiIndex([(&#39;x&#39;, &#39;a&#39;),</span>
<span class="sd"> (&#39;x&#39;, &#39;b&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;c&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;d&#39;),</span>
<span class="sd"> (&#39;z&#39;, &#39;e&#39;)],</span>
<span class="sd"> )</span>
<span class="sd"> &gt;&gt;&gt; midx.is_monotonic</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; midx = ps.MultiIndex.from_tuples(</span>
<span class="sd"> ... [(&#39;z&#39;, &#39;a&#39;), (&#39;z&#39;, &#39;b&#39;), (&#39;y&#39;, &#39;c&#39;), (&#39;y&#39;, &#39;d&#39;), (&#39;x&#39;, &#39;e&#39;)])</span>
<span class="sd"> &gt;&gt;&gt; midx # doctest: +SKIP</span>
<span class="sd"> MultiIndex([(&#39;z&#39;, &#39;a&#39;),</span>
<span class="sd"> (&#39;z&#39;, &#39;b&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;c&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;d&#39;),</span>
<span class="sd"> (&#39;x&#39;, &#39;e&#39;)],</span>
<span class="sd"> )</span>
<span class="sd"> &gt;&gt;&gt; midx.is_monotonic</span>
<span class="sd"> False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_is_monotonic</span><span class="p">(</span><span class="s2">&quot;increasing&quot;</span><span class="p">)</span>
<span class="n">is_monotonic_increasing</span> <span class="o">=</span> <span class="n">is_monotonic</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">is_monotonic_decreasing</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return a boolean indicating whether the values in the object are monotonically decreasing.</span>
<span class="sd"> .. note:: the current implementation of is_monotonic_decreasing needs to shuffle</span>
<span class="sd"> and aggregate multiple times to check the order locally and globally,</span>
<span class="sd"> which is potentially expensive. In case of multi-index, all data are transferred</span>
<span class="sd"> to a single node, which currently can easily cause an out-of-memory error.</span>
<span class="sd"> .. note:: Disable the Spark config `spark.sql.optimizer.nestedSchemaPruning.enabled`</span>
<span class="sd"> for multi-index if you&#39;re using pandas-on-Spark &lt; 1.7.0 with PySpark 3.1.1.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> is_monotonic_decreasing : bool</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([&#39;4/1/2018&#39;, &#39;3/1/2018&#39;, &#39;1/1/2018&#39;])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic_decreasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;dates&#39;: [None, &#39;3/1/2018&#39;, &#39;2/1/2018&#39;, &#39;1/1/2018&#39;]})</span>
<span class="sd"> &gt;&gt;&gt; df.dates.is_monotonic_decreasing</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; df.index.is_monotonic_decreasing</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([1])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic_decreasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic_decreasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser.rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.is_monotonic_decreasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([5, 4, 3, 2, 1], index=[1, 2, 3, 4, 5])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic_decreasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser.index.is_monotonic_decreasing</span>
<span class="sd"> False</span>
<span class="sd"> Support for MultiIndex</span>
<span class="sd"> &gt;&gt;&gt; midx = ps.MultiIndex.from_tuples(</span>
<span class="sd"> ... [(&#39;x&#39;, &#39;a&#39;), (&#39;x&#39;, &#39;b&#39;), (&#39;y&#39;, &#39;c&#39;), (&#39;y&#39;, &#39;d&#39;), (&#39;z&#39;, &#39;e&#39;)])</span>
<span class="sd"> &gt;&gt;&gt; midx # doctest: +SKIP</span>
<span class="sd"> MultiIndex([(&#39;x&#39;, &#39;a&#39;),</span>
<span class="sd"> (&#39;x&#39;, &#39;b&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;c&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;d&#39;),</span>
<span class="sd"> (&#39;z&#39;, &#39;e&#39;)],</span>
<span class="sd"> )</span>
<span class="sd"> &gt;&gt;&gt; midx.is_monotonic_decreasing</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; midx = ps.MultiIndex.from_tuples(</span>
<span class="sd"> ... [(&#39;z&#39;, &#39;e&#39;), (&#39;z&#39;, &#39;d&#39;), (&#39;y&#39;, &#39;c&#39;), (&#39;y&#39;, &#39;b&#39;), (&#39;x&#39;, &#39;a&#39;)])</span>
<span class="sd"> &gt;&gt;&gt; midx # doctest: +SKIP</span>
<span class="sd"> MultiIndex([(&#39;z&#39;, &#39;e&#39;),</span>
<span class="sd"> (&#39;z&#39;, &#39;d&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;c&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;b&#39;),</span>
<span class="sd"> (&#39;x&#39;, &#39;a&#39;)],</span>
<span class="sd"> )</span>
<span class="sd"> &gt;&gt;&gt; midx.is_monotonic_decreasing</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_is_monotonic</span><span class="p">(</span><span class="s2">&quot;decreasing&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_is_locally_monotonic_spark_column</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">order</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">window</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_id&quot;</span><span class="p">))</span>
<span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span>
<span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">order</span> <span class="o">==</span> <span class="s2">&quot;increasing&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span> <span class="o">&amp;</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span>
<span class="s2">&quot;__origin&quot;</span>
<span class="p">)</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span> <span class="o">&amp;</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span>
<span class="s2">&quot;__origin&quot;</span>
<span class="p">)</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">_is_monotonic</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">order</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="k">assert</span> <span class="n">order</span> <span class="ow">in</span> <span class="p">(</span><span class="s2">&quot;increasing&quot;</span><span class="p">,</span> <span class="s2">&quot;decreasing&quot;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">spark_partition_id</span><span class="p">()</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span>
<span class="s2">&quot;__partition_id&quot;</span>
<span class="p">),</span> <span class="c1"># Make sure we use the same partition id in the whole job.</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">),</span>
<span class="p">)</span>
<span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_id&quot;</span><span class="p">),</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_is_locally_monotonic_spark_column</span><span class="p">(</span><span class="n">order</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span>
<span class="s2">&quot;__comparison_within_partition&quot;</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_id&quot;</span><span class="p">))</span>
<span class="o">.</span><span class="n">agg</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;__partition_min&quot;</span><span class="p">),</span>
<span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;__partition_max&quot;</span><span class="p">),</span>
<span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__comparison_within_partition&quot;</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span>
<span class="s2">&quot;__comparison_within_partition&quot;</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="p">)</span>
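<span class="c1"># Now we&#39;re windowing the aggregation results without partition specification.</span>
<span class="c1"># The number of rows here is the same as the number of partitions, which is expected</span>
<span class="c1"># to be small.</span>
<span class="c1"># NOTE(review): for the &quot;decreasing&quot; order, the check below compares this partition&#39;s</span>
<span class="c1"># minimum with the previous partition&#39;s maximum; confirm that this is strict enough,</span>
<span class="c1"># e.g. partitions [5, 4] and [6, 3] pass it although 5, 4, 6, 3 is not decreasing.</span>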
<span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_id&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="n">order</span> <span class="o">==</span> <span class="s2">&quot;increasing&quot;</span><span class="p">:</span>
<span class="n">comparison_col</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_min&quot;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_max&quot;</span><span class="p">),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span>
<span class="n">window</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">comparison_col</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_min&quot;</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_max&quot;</span><span class="p">),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span>
<span class="n">window</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">comparison_col</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;__comparison_between_partitions&quot;</span><span class="p">),</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__comparison_within_partition&quot;</span><span class="p">),</span>
<span class="p">)</span>
<span class="n">ret</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__comparison_between_partitions&quot;</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)))</span>
<span class="o">&amp;</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__comparison_within_partition&quot;</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)))</span>
<span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
<span class="k">if</span> <span class="n">ret</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">True</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">ret</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">ndim</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return an int representing the number of array dimensions.</span>
<span class="sd"> Return 1 for Series / Index / MultiIndex.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> For Series</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([None, 1, 2, 3, 4], index=[4, 5, 2, 1, 8])</span>
<span class="sd"> &gt;&gt;&gt; s.ndim</span>
<span class="sd"> 1</span>
<span class="sd"> For Index</span>
<span class="sd"> &gt;&gt;&gt; s.index.ndim</span>
<span class="sd"> 1</span>
<span class="sd"> For MultiIndex</span>
<span class="sd"> &gt;&gt;&gt; midx = pd.MultiIndex([[&#39;lama&#39;, &#39;cow&#39;, &#39;falcon&#39;],</span>
<span class="sd"> ... [&#39;speed&#39;, &#39;weight&#39;, &#39;length&#39;]],</span>
<span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span>
<span class="sd"> ... [1, 1, 1, 1, 1, 2, 1, 2, 2]])</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)</span>
<span class="sd"> &gt;&gt;&gt; s.index.ndim</span>
<span class="sd"> 1</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="mi">1</span>
<span class="k">def</span> <span class="nf">astype</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">,</span> <span class="n">dtype</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">type</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Cast a pandas-on-Spark object to a specified dtype ``dtype``.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> dtype : data type</span>
<span class="sd"> Use a numpy.dtype or Python type to cast entire pandas object to</span>
<span class="sd"> the same type.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> casted : same type as caller</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> to_datetime : Convert argument to datetime.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([1, 2], dtype=&#39;int32&#39;)</span>
<span class="sd"> &gt;&gt;&gt; ser</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 2</span>
<span class="sd"> dtype: int32</span>
<span class="sd"> &gt;&gt;&gt; ser.astype(&#39;int64&#39;)</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 2</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; ser.rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.astype(&#39;int64&#39;)</span>
<span class="sd"> Int64Index([1, 2], dtype=&#39;int64&#39;, name=&#39;a&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dtype</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">isin</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">,</span> <span class="n">values</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Check whether `values` are contained in Series or Index.</span>
<span class="sd"> Return a boolean Series or Index showing whether each element in the Series</span>
<span class="sd"> matches an element in the passed sequence of `values` exactly.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> values : set or list-like</span>
<span class="sd"> The sequence of values to test.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> isin : Series (bool dtype) or Index (bool dtype)</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([&#39;lama&#39;, &#39;cow&#39;, &#39;lama&#39;, &#39;beetle&#39;, &#39;lama&#39;,</span>
<span class="sd"> ... &#39;hippo&#39;], name=&#39;animal&#39;)</span>
<span class="sd"> &gt;&gt;&gt; s.isin([&#39;cow&#39;, &#39;lama&#39;])</span>
<span class="sd"> 0 True</span>
<span class="sd"> 1 True</span>
<span class="sd"> 2 True</span>
<span class="sd"> 3 False</span>
<span class="sd"> 4 True</span>
<span class="sd"> 5 False</span>
<span class="sd"> Name: animal, dtype: bool</span>
<span class="sd"> Passing a single string as ``s.isin(&#39;lama&#39;)`` will raise an error. Use</span>
<span class="sd"> a list of one element instead:</span>
<span class="sd"> &gt;&gt;&gt; s.isin([&#39;lama&#39;])</span>
<span class="sd"> 0 True</span>
<span class="sd"> 1 False</span>
<span class="sd"> 2 True</span>
<span class="sd"> 3 False</span>
<span class="sd"> 4 True</span>
<span class="sd"> 5 False</span>
<span class="sd"> Name: animal, dtype: bool</span>
<span class="sd"> &gt;&gt;&gt; s.rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.isin([&#39;lama&#39;])</span>
<span class="sd"> Index([True, False, True, False, True, False], dtype=&#39;object&#39;, name=&#39;a&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">values</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;only list-like objects are allowed to be passed&quot;</span>
<span class="s2">&quot; to isin(), you passed a [</span><span class="si">{values_type}</span><span class="s2">]&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">values_type</span><span class="o">=</span><span class="nb">type</span><span class="p">(</span><span class="n">values</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">values</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">cast</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">,</span> <span class="n">values</span><span class="p">)</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">)</span> <span class="k">else</span> <span class="nb">list</span><span class="p">(</span><span class="n">values</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">other</span> <span class="o">=</span> <span class="p">[</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">values</span><span class="p">]</span>
<span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">isin</span><span class="p">(</span><span class="n">other</span><span class="p">)</span>
<span class="n">field</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">dtype</span><span class="p">(</span><span class="s2">&quot;bool&quot;</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">=</span><span class="n">BooleanType</span><span class="p">(),</span> <span class="n">nullable</span><span class="o">=</span><span class="kc">False</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">scol</span><span class="o">=</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">)),</span> <span class="n">field</span><span class="o">=</span><span class="n">field</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">isnull</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Detect missing values.</span>
<span class="sd"> Return a boolean same-sized object indicating if the values are NA.</span>
<span class="sd"> NA values, such as None or numpy.NaN, get mapped to True values.</span>
<span class="sd"> Everything else gets mapped to False values. Characters such as empty strings &#39;&#39; or</span>
<span class="sd"> numpy.inf are not considered NA values</span>
<span class="sd"> (unless you set pandas.options.mode.use_inf_as_na = True).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or Index : Mask of bool values for each element in Series</span>
<span class="sd"> that indicates whether an element is an NA value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([5, 6, np.NaN])</span>
<span class="sd"> &gt;&gt;&gt; ser.isna() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> 0 False</span>
<span class="sd"> 1 False</span>
<span class="sd"> 2 True</span>
<span class="sd"> dtype: bool</span>
<span class="sd"> &gt;&gt;&gt; ser.rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.isna()</span>
<span class="sd"> Index([False, False, True], dtype=&#39;object&#39;, name=&#39;a&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes</span> <span class="kn">import</span> <span class="n">MultiIndex</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">MultiIndex</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;isna is not defined for MultiIndex&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">isnull</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="n">isna</span> <span class="o">=</span> <span class="n">isnull</span>
<span class="k">def</span> <span class="nf">notnull</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Detect existing (non-missing) values.</span>
<span class="sd"> Return a boolean same-sized object indicating if the values are not NA.</span>
<span class="sd"> Non-missing values get mapped to True.</span>
<span class="sd"> Characters such as empty strings &#39;&#39; or numpy.inf are not considered NA values</span>
<span class="sd"> (unless you set pandas.options.mode.use_inf_as_na = True).</span>
<span class="sd"> NA values, such as None or numpy.NaN, get mapped to False values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or Index : Mask of bool values for each element in Series</span>
<span class="sd"> that indicates whether an element is not an NA value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Show which entries in a Series are not NA.</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([5, 6, np.NaN])</span>
<span class="sd"> &gt;&gt;&gt; ser</span>
<span class="sd"> 0 5.0</span>
<span class="sd"> 1 6.0</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; ser.notna()</span>
<span class="sd"> 0 True</span>
<span class="sd"> 1 True</span>
<span class="sd"> 2 False</span>
<span class="sd"> dtype: bool</span>
<span class="sd"> &gt;&gt;&gt; ser.rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.notna()</span>
<span class="sd"> Index([True, True, False], dtype=&#39;object&#39;, name=&#39;a&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes</span> <span class="kn">import</span> <span class="n">MultiIndex</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">MultiIndex</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;notna is not defined for MultiIndex&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="p">(</span><span class="o">~</span><span class="bp">self</span><span class="o">.</span><span class="n">isnull</span><span class="p">())</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="c1"># type: ignore[attr-defined]</span>
<span class="n">notna</span> <span class="o">=</span> <span class="n">notnull</span>
<span class="c1"># TODO: axis and many arguments should be implemented.</span>
<span class="k">def</span> <span class="nf">all</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return whether all elements are True.</span>
<span class="sd"> Returns True unless there is at least one element within a series that is</span>
<span class="sd"> False or equivalent (e.g. zero or empty).</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {0 or &#39;index&#39;}, default 0</span>
<span class="sd"> Indicate which axis or axes should be reduced.</span>
<span class="sd"> * 0 / &#39;index&#39; : reduce the index, return a Series whose index is the</span>
<span class="sd"> original column labels.</span>
<span class="sd"> skipna : boolean, default True</span>
<span class="sd"> Exclude NA/null values. If an entire row/column is NA and skipna is True,</span>
<span class="sd"> then the result will be True, as for an empty row/column.</span>
<span class="sd"> If skipna is False, missing values in numeric data are treated as True,</span>
<span class="sd"> while missing values in non-numeric data are treated as False.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([True, True]).all()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([True, False]).all()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([0, 1]).all()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3]).all()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([True, True, None]).all()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([True, True, None]).all(skipna=False)</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([True, False, None]).all()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([]).all()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([np.nan]).all()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([np.nan]).all(skipna=False)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([None]).all()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([None]).all(skipna=False)</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; df = ps.Series([True, False, None]).rename(&quot;a&quot;).to_frame()</span>
<span class="sd"> &gt;&gt;&gt; df.set_index(&quot;a&quot;).index.all()</span>
<span class="sd"> False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">&#39;axis should be either 0 or &quot;index&quot; currently.&#39;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="c1"># `any` and `every` were added as of Spark 3.0.</span>
<span class="c1"># ret = sdf.select(F.expr(&quot;every(CAST(`%s` AS BOOLEAN))&quot; % sdf.columns[0])).collect()[0][0]</span>
<span class="c1"># We use min as its alternative as below.</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)</span> <span class="ow">or</span> <span class="n">skipna</span><span class="p">:</span>
<span class="c1"># np.nan has no effect on the result; None has no effect if `skipna`</span>
<span class="n">ret</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;boolean&quot;</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">))))</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># Take None as False when not `skipna`</span>
<span class="n">ret</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;boolean&quot;</span><span class="p">)))</span>
<span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
<span class="k">if</span> <span class="n">ret</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">True</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">ret</span>
<span class="c1"># TODO: axis, skipna, and many arguments should be implemented.</span>
<span class="k">def</span> <span class="nf">any</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return whether any element is True.</span>
<span class="sd"> Returns False unless there is at least one element within a series that is</span>
<span class="sd"> True or equivalent (e.g. non-zero or non-empty).</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {0 or &#39;index&#39;}, default 0</span>
<span class="sd"> Indicate which axis or axes should be reduced.</span>
<span class="sd"> * 0 / &#39;index&#39; : reduce the index, return a Series whose index is the</span>
<span class="sd"> original column labels.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([False, False]).any()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([True, False]).any()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([0, 0]).any()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([0, 1, 2]).any()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([False, False, None]).any()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([True, False, None]).any()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([]).any()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([np.nan]).any()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; df = ps.Series([True, False, None]).rename(&quot;a&quot;).to_frame()</span>
<span class="sd"> &gt;&gt;&gt; df.set_index(&quot;a&quot;).index.any()</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">&#39;axis should be either 0 or &quot;index&quot; currently.&#39;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="c1"># Note that we&#39;re ignoring `None`s here for now.</span>
<span class="c1"># any and every were added as of Spark 3.0</span>
<span class="c1"># ret = sdf.select(F.expr(&quot;any(CAST(`%s` AS BOOLEAN))&quot; % sdf.columns[0])).collect()[0][0]</span>
<span class="c1"># Here we use max as its alternative:</span>
<span class="n">ret</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;boolean&quot;</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">))))</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
<span class="k">if</span> <span class="n">ret</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">False</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">ret</span>
<span class="c1"># TODO: add freq and axis parameters</span>
<span class="k">def</span> <span class="nf">shift</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Shift Series/Index by desired number of periods.</span>
<span class="sd"> .. note:: the current implementation of shift uses Spark&#39;s Window without</span>
<span class="sd"> specifying partition specification. This moves all data into a</span>
<span class="sd"> single partition on a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method with very large datasets.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> periods : int</span>
<span class="sd"> Number of periods to shift. Can be positive or negative.</span>
<span class="sd"> fill_value : object, optional</span>
<span class="sd"> The scalar value to use for newly introduced missing values.</span>
<span class="sd"> The default depends on the dtype of self. For numeric data, np.nan is used.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Copy of input Series/Index, shifted.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;Col1&#39;: [10, 20, 15, 30, 45],</span>
<span class="sd"> ... &#39;Col2&#39;: [13, 23, 18, 33, 48],</span>
<span class="sd"> ... &#39;Col3&#39;: [17, 27, 22, 37, 52]},</span>
<span class="sd"> ... columns=[&#39;Col1&#39;, &#39;Col2&#39;, &#39;Col3&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.Col1.shift(periods=3)</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 NaN</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> 3 10.0</span>
<span class="sd"> 4 20.0</span>
<span class="sd"> Name: Col1, dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.Col2.shift(periods=3, fill_value=0)</span>
<span class="sd"> 0 0</span>
<span class="sd"> 1 0</span>
<span class="sd"> 2 0</span>
<span class="sd"> 3 13</span>
<span class="sd"> 4 23</span>
<span class="sd"> Name: Col2, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.index.shift(periods=3, fill_value=0)</span>
<span class="sd"> Int64Index([0, 0, 0, 0, 1], dtype=&#39;int64&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_shift</span><span class="p">(</span><span class="n">periods</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">analyzed</span>
<span class="k">def</span> <span class="nf">_shift</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">,</span>
<span class="n">periods</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">fill_value</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">part_cols</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">(),</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">periods</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;periods should be an int; however, got [</span><span class="si">%s</span><span class="s2">]&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">periods</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="n">col</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="n">window</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">part_cols</span><span class="p">)</span>
<span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span>
<span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="o">-</span><span class="n">periods</span><span class="p">,</span> <span class="o">-</span><span class="n">periods</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">lag_col</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">periods</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">lag_col</span><span class="o">.</span><span class="n">isNull</span><span class="p">()</span> <span class="o">|</span> <span class="n">F</span><span class="o">.</span><span class="n">isnan</span><span class="p">(</span><span class="n">lag_col</span><span class="p">),</span> <span class="n">fill_value</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">lag_col</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">field</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">))</span>
<span class="c1"># TODO: Update Documentation for Bins Parameter when it&#39;s supported</span>
<span class="k">def</span> <span class="nf">value_counts</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">normalize</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">sort</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">bins</span><span class="p">:</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Series&quot;</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return a Series containing counts of unique values.</span>
<span class="sd"> The resulting object will be in descending order so that the</span>
<span class="sd"> first element is the most frequently-occurring element.</span>
<span class="sd"> Excludes NA values by default.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> normalize : boolean, default False</span>
<span class="sd"> If True then the object returned will contain the relative</span>
<span class="sd"> frequencies of the unique values.</span>
<span class="sd"> sort : boolean, default True</span>
<span class="sd"> Sort by values.</span>
<span class="sd"> ascending : boolean, default False</span>
<span class="sd"> Sort in ascending order.</span>
<span class="sd"> bins : Not Yet Supported</span>
<span class="sd"> dropna : boolean, default True</span>
<span class="sd"> Don&#39;t include counts of NaN.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> counts : Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.count: Number of non-NA elements in a Series.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> For Series</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;x&#39;:[0, 0, 1, 1, 1, np.nan]})</span>
<span class="sd"> &gt;&gt;&gt; df.x.value_counts() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> 1.0 3</span>
<span class="sd"> 0.0 2</span>
<span class="sd"> Name: x, dtype: int64</span>
<span class="sd"> With `normalize` set to `True`, returns the relative frequency by</span>
<span class="sd"> dividing all values by the sum of values.</span>
<span class="sd"> &gt;&gt;&gt; df.x.value_counts(normalize=True) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> 1.0 0.6</span>
<span class="sd"> 0.0 0.4</span>
<span class="sd"> Name: x, dtype: float64</span>
<span class="sd"> **dropna**</span>
<span class="sd"> With `dropna` set to `False` we can also see NaN index values.</span>
<span class="sd"> &gt;&gt;&gt; df.x.value_counts(dropna=False) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> 1.0 3</span>
<span class="sd"> 0.0 2</span>
<span class="sd"> NaN 1</span>
<span class="sd"> Name: x, dtype: int64</span>
<span class="sd"> For Index</span>
<span class="sd"> &gt;&gt;&gt; idx = ps.Index([3, 1, 2, 3, 4, np.nan])</span>
<span class="sd"> &gt;&gt;&gt; idx</span>
<span class="sd"> Float64Index([3.0, 1.0, 2.0, 3.0, 4.0, nan], dtype=&#39;float64&#39;)</span>
<span class="sd"> &gt;&gt;&gt; idx.value_counts().sort_index()</span>
<span class="sd"> 1.0 1</span>
<span class="sd"> 2.0 1</span>
<span class="sd"> 3.0 2</span>
<span class="sd"> 4.0 1</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> **sort**</span>
<span class="sd"> With `sort` set to `False`, the result is not sorted by count.</span>
<span class="sd"> &gt;&gt;&gt; idx.value_counts(sort=False).sort_index()</span>
<span class="sd"> 1.0 1</span>
<span class="sd"> 2.0 1</span>
<span class="sd"> 3.0 2</span>
<span class="sd"> 4.0 1</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> **normalize**</span>
<span class="sd"> With `normalize` set to `True`, returns the relative frequency by</span>
<span class="sd"> dividing all values by the sum of values.</span>
<span class="sd"> &gt;&gt;&gt; idx.value_counts(normalize=True).sort_index()</span>
<span class="sd"> 1.0 0.2</span>
<span class="sd"> 2.0 0.2</span>
<span class="sd"> 3.0 0.4</span>
<span class="sd"> 4.0 0.2</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> **dropna**</span>
<span class="sd"> With `dropna` set to `False` we can also see NaN index values.</span>
<span class="sd"> &gt;&gt;&gt; idx.value_counts(dropna=False).sort_index() # doctest: +SKIP</span>
<span class="sd"> 1.0 1</span>
<span class="sd"> 2.0 1</span>
<span class="sd"> 3.0 2</span>
<span class="sd"> 4.0 1</span>
<span class="sd"> NaN 1</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> For MultiIndex.</span>
<span class="sd"> &gt;&gt;&gt; midx = pd.MultiIndex([[&#39;lama&#39;, &#39;cow&#39;, &#39;falcon&#39;],</span>
<span class="sd"> ... [&#39;speed&#39;, &#39;weight&#39;, &#39;length&#39;]],</span>
<span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span>
<span class="sd"> ... [1, 1, 1, 1, 1, 2, 1, 2, 2]])</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)</span>
<span class="sd"> &gt;&gt;&gt; s.index # doctest: +SKIP</span>
<span class="sd"> MultiIndex([( &#39;lama&#39;, &#39;weight&#39;),</span>
<span class="sd"> ( &#39;lama&#39;, &#39;weight&#39;),</span>
<span class="sd"> ( &#39;lama&#39;, &#39;weight&#39;),</span>
<span class="sd"> ( &#39;cow&#39;, &#39;weight&#39;),</span>
<span class="sd"> ( &#39;cow&#39;, &#39;weight&#39;),</span>
<span class="sd"> ( &#39;cow&#39;, &#39;length&#39;),</span>
<span class="sd"> (&#39;falcon&#39;, &#39;weight&#39;),</span>
<span class="sd"> (&#39;falcon&#39;, &#39;length&#39;),</span>
<span class="sd"> (&#39;falcon&#39;, &#39;length&#39;)],</span>
<span class="sd"> )</span>
<span class="sd"> &gt;&gt;&gt; s.index.value_counts().sort_index()</span>
<span class="sd"> (cow, length) 1</span>
<span class="sd"> (cow, weight) 2</span>
<span class="sd"> (falcon, length) 2</span>
<span class="sd"> (falcon, weight) 1</span>
<span class="sd"> (lama, weight) 3</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; s.index.value_counts(normalize=True).sort_index()</span>
<span class="sd"> (cow, length) 0.111111</span>
<span class="sd"> (cow, weight) 0.222222</span>
<span class="sd"> (falcon, length) 0.222222</span>
<span class="sd"> (falcon, weight) 0.111111</span>
<span class="sd"> (lama, weight) 0.333333</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> If the Index has a name, the name is preserved in the result.</span>
<span class="sd"> &gt;&gt;&gt; idx = ps.Index([0, 0, 0, 1, 1, 2, 3], name=&#39;pandas-on-Spark&#39;)</span>
<span class="sd"> &gt;&gt;&gt; idx.value_counts().sort_index()</span>
<span class="sd"> 0 3</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 1</span>
<span class="sd"> 3 1</span>
<span class="sd"> Name: pandas-on-Spark, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span>
<span class="k">if</span> <span class="n">bins</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;value_counts currently does not support bins&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">dropna</span><span class="p">:</span>
<span class="n">sdf_dropna</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span><span class="o">.</span><span class="n">dropna</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf_dropna</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span>
<span class="n">index_name</span> <span class="o">=</span> <span class="n">SPARK_DEFAULT_INDEX_NAME</span>
<span class="n">column_name</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf_dropna</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf_dropna</span><span class="p">,</span> <span class="n">column_name</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_name</span><span class="p">))</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="k">if</span> <span class="n">sort</span><span class="p">:</span>
<span class="k">if</span> <span class="n">ascending</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;count&quot;</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;count&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">desc</span><span class="p">())</span>
<span class="k">if</span> <span class="n">normalize</span><span class="p">:</span>
<span class="n">drop_sum</span> <span class="o">=</span> <span class="n">sdf_dropna</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">&quot;count&quot;</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;count&quot;</span><span class="p">)</span> <span class="o">/</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">drop_sum</span><span class="p">))</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_name</span><span class="p">)],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;count&quot;</span><span class="p">)],</span>
<span class="n">column_label_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">nunique</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">approx</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">rsd</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return number of unique elements in the object.</span>
<span class="sd"> Excludes NA values by default.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> dropna : bool, default True</span>
<span class="sd"> Don’t include NaN in the count.</span>
<span class="sd"> approx: bool, default False</span>
<span class="sd"> If False, will use the exact algorithm and return the exact number of unique values.</span>
<span class="sd"> If True, it uses the HyperLogLog approximate algorithm, which is significantly faster</span>
<span class="sd"> for large amounts of data.</span>
<span class="sd"> Note: This parameter is specific to pandas-on-Spark and is not found in pandas.</span>
<span class="sd"> rsd: float, default 0.05</span>
<span class="sd"> Maximum estimation error allowed in the HyperLogLog algorithm.</span>
<span class="sd"> Note: Just like ``approx`` this parameter is specific to pandas-on-Spark.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> int</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.nunique: Method nunique for DataFrame.</span>
<span class="sd"> Series.count: Count non-NA/null observations in the Series.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3, np.nan]).nunique()</span>
<span class="sd"> 3</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3, np.nan]).nunique(dropna=False)</span>
<span class="sd"> 4</span>
<span class="sd"> On big data, we recommend using the approximate algorithm to speed up this function.</span>
<span class="sd"> The result will be very close to the exact unique count.</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3, np.nan]).nunique(approx=True)</span>
<span class="sd"> 3</span>
<span class="sd"> &gt;&gt;&gt; idx = ps.Index([1, 1, 2, None])</span>
<span class="sd"> &gt;&gt;&gt; idx</span>
<span class="sd"> Float64Index([1.0, 1.0, 2.0, nan], dtype=&#39;float64&#39;)</span>
<span class="sd"> &gt;&gt;&gt; idx.nunique()</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; idx.nunique(dropna=False)</span>
<span class="sd"> 3</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">res</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">([</span><span class="bp">self</span><span class="o">.</span><span class="n">_nunique</span><span class="p">(</span><span class="n">dropna</span><span class="p">,</span> <span class="n">approx</span><span class="p">,</span> <span class="n">rsd</span><span class="p">)])</span>
<span class="k">return</span> <span class="n">res</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
<span class="k">def</span> <span class="nf">_nunique</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">approx</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">rsd</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.05</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">colname</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">count_fn</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span>
<span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span>
<span class="n">partial</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">approx_count_distinct</span><span class="p">,</span> <span class="n">rsd</span><span class="o">=</span><span class="n">rsd</span><span class="p">)</span> <span class="k">if</span> <span class="n">approx</span> <span class="k">else</span> <span class="n">F</span><span class="o">.</span><span class="n">countDistinct</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">dropna</span><span class="p">:</span>
<span class="k">return</span> <span class="n">count_fn</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">colname</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="p">(</span>
<span class="n">count_fn</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span>
<span class="o">+</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">))</span> <span class="o">&gt;=</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span>
<span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
<span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">colname</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">take</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">,</span> <span class="n">indices</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="nb">int</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the elements in the given *positional* indices along an axis.</span>
<span class="sd"> This means that we are not indexing according to actual values in</span>
<span class="sd"> the index attribute of the object. We are indexing according to the</span>
<span class="sd"> actual position of the element in the object.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> indices : array-like</span>
<span class="sd"> An array of ints indicating which positions to take.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> taken : same type as caller</span>
<span class="sd"> An array-like containing the elements taken from the object.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.loc : Select a subset of a DataFrame by labels.</span>
<span class="sd"> DataFrame.iloc : Select a subset of a DataFrame by positions.</span>
<span class="sd"> numpy.take : Take elements from an array along an axis.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Series</span>
<span class="sd"> &gt;&gt;&gt; psser = ps.Series([100, 200, 300, 400, 500])</span>
<span class="sd"> &gt;&gt;&gt; psser</span>
<span class="sd"> 0 100</span>
<span class="sd"> 1 200</span>
<span class="sd"> 2 300</span>
<span class="sd"> 3 400</span>
<span class="sd"> 4 500</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; psser.take([0, 2, 4]).sort_index()</span>
<span class="sd"> 0 100</span>
<span class="sd"> 2 300</span>
<span class="sd"> 4 500</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> Index</span>
<span class="sd"> &gt;&gt;&gt; psidx = ps.Index([100, 200, 300, 400, 500])</span>
<span class="sd"> &gt;&gt;&gt; psidx</span>
<span class="sd"> Int64Index([100, 200, 300, 400, 500], dtype=&#39;int64&#39;)</span>
<span class="sd"> &gt;&gt;&gt; psidx.take([0, 2, 4]).sort_values()</span>
<span class="sd"> Int64Index([100, 300, 500], dtype=&#39;int64&#39;)</span>
<span class="sd"> MultiIndex</span>
<span class="sd"> &gt;&gt;&gt; psmidx = ps.MultiIndex.from_tuples([(&quot;x&quot;, &quot;a&quot;), (&quot;x&quot;, &quot;b&quot;), (&quot;x&quot;, &quot;c&quot;)])</span>
<span class="sd"> &gt;&gt;&gt; psmidx # doctest: +SKIP</span>
<span class="sd"> MultiIndex([(&#39;x&#39;, &#39;a&#39;),</span>
<span class="sd"> (&#39;x&#39;, &#39;b&#39;),</span>
<span class="sd"> (&#39;x&#39;, &#39;c&#39;)],</span>
<span class="sd"> )</span>
<span class="sd"> &gt;&gt;&gt; psmidx.take([0, 2]) # doctest: +SKIP</span>
<span class="sd"> MultiIndex([(&#39;x&#39;, &#39;a&#39;),</span>
<span class="sd"> (&#39;x&#39;, &#39;c&#39;)],</span>
<span class="sd"> )</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">indices</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">indices</span><span class="p">,</span> <span class="p">(</span><span class="nb">dict</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;`indices` must be a list-like except dict or set&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">IndexOpsLike</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="n">indices</span><span class="p">])</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">IndexOpsLike</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="n">indices</span><span class="p">]</span><span class="o">.</span><span class="n">index</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">factorize</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">,</span> <span class="n">sort</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">na_sentinel</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">IndexOpsLike</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">]:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Encode the object as an enumerated type or categorical variable.</span>
<span class="sd"> This method is useful for obtaining a numeric representation of an</span>
<span class="sd"> array when all that matters is identifying distinct values.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> sort : bool, default True</span>
<span class="sd"> na_sentinel : int or None, default -1</span>
<span class="sd"> Value to mark &quot;not found&quot;. If None, will not drop the NaN</span>
<span class="sd"> from the uniques of the values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> codes : Series or Index</span>
<span class="sd"> A Series or Index that&#39;s an indexer into `uniques`.</span>
<span class="sd"> ``uniques.take(codes)`` will have the same values as `values`.</span>
<span class="sd"> uniques : pd.Index</span>
<span class="sd"> The unique valid values.</span>
<span class="sd"> .. note ::</span>
<span class="sd"> Even if there&#39;s a missing value in `values`, `uniques` will</span>
<span class="sd"> *not* contain an entry for it.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psser = ps.Series([&#39;b&#39;, None, &#39;a&#39;, &#39;c&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; codes, uniques = psser.factorize()</span>
<span class="sd"> &gt;&gt;&gt; codes</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 -1</span>
<span class="sd"> 2 0</span>
<span class="sd"> 3 2</span>
<span class="sd"> 4 1</span>
<span class="sd"> dtype: int32</span>
<span class="sd"> &gt;&gt;&gt; uniques</span>
<span class="sd"> Index([&#39;a&#39;, &#39;b&#39;, &#39;c&#39;], dtype=&#39;object&#39;)</span>
<span class="sd"> &gt;&gt;&gt; codes, uniques = psser.factorize(na_sentinel=None)</span>
<span class="sd"> &gt;&gt;&gt; codes</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 3</span>
<span class="sd"> 2 0</span>
<span class="sd"> 3 2</span>
<span class="sd"> 4 1</span>
<span class="sd"> dtype: int32</span>
<span class="sd"> &gt;&gt;&gt; uniques</span>
<span class="sd"> Index([&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, None], dtype=&#39;object&#39;)</span>
<span class="sd"> &gt;&gt;&gt; codes, uniques = psser.factorize(na_sentinel=-2)</span>
<span class="sd"> &gt;&gt;&gt; codes</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 -2</span>
<span class="sd"> 2 0</span>
<span class="sd"> 3 2</span>
<span class="sd"> 4 1</span>
<span class="sd"> dtype: int32</span>
<span class="sd"> &gt;&gt;&gt; uniques</span>
<span class="sd"> Index([&#39;a&#39;, &#39;b&#39;, &#39;c&#39;], dtype=&#39;object&#39;)</span>
<span class="sd"> For Index:</span>
<span class="sd"> &gt;&gt;&gt; psidx = ps.Index([&#39;b&#39;, None, &#39;a&#39;, &#39;c&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; codes, uniques = psidx.factorize()</span>
<span class="sd"> &gt;&gt;&gt; codes</span>
<span class="sd"> Int64Index([1, -1, 0, 2, 1], dtype=&#39;int64&#39;)</span>
<span class="sd"> &gt;&gt;&gt; uniques</span>
<span class="sd"> Index([&#39;a&#39;, &#39;b&#39;, &#39;c&#39;], dtype=&#39;object&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span>
<span class="k">assert</span> <span class="p">(</span><span class="n">na_sentinel</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">na_sentinel</span><span class="p">,</span> <span class="nb">int</span><span class="p">)</span>
<span class="k">assert</span> <span class="n">sort</span> <span class="ow">is</span> <span class="kc">True</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">CategoricalDtype</span><span class="p">):</span>
<span class="n">categories</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">dtype</span><span class="o">.</span><span class="n">categories</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">categories</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">kvs</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span>
<span class="n">chain</span><span class="p">(</span>
<span class="o">*</span><span class="p">[</span>
<span class="p">(</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">code</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">category</span><span class="p">))</span>
<span class="k">for</span> <span class="n">code</span><span class="p">,</span> <span class="n">category</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">categories</span><span class="p">)</span>
<span class="p">]</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="n">map_scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">create_map</span><span class="p">(</span><span class="o">*</span><span class="n">kvs</span><span class="p">)</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">map_scol</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">]</span>
<span class="n">codes</span><span class="p">,</span> <span class="n">uniques</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span>
<span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="p">)</span><span class="o">.</span><span class="n">factorize</span><span class="p">(</span><span class="n">na_sentinel</span><span class="o">=</span><span class="n">na_sentinel</span><span class="p">)</span>
<span class="k">return</span> <span class="n">codes</span><span class="p">,</span> <span class="n">uniques</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span>
<span class="n">uniq_sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span>
<span class="c1"># Check number of uniques and constructs sorted `uniques_list`</span>
<span class="n">max_compute_count</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">&quot;compute.max_rows&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">max_compute_count</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">uniq_pdf</span> <span class="o">=</span> <span class="n">uniq_sdf</span><span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="n">max_compute_count</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">toPandas</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">uniq_pdf</span><span class="p">)</span> <span class="o">&gt;</span> <span class="n">max_compute_count</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Current Series has more then </span><span class="si">{0}</span><span class="s2"> unique values. &quot;</span>
<span class="s2">&quot;Please set &#39;compute.max_rows&#39; by using &#39;pyspark.pandas.config.set_option&#39; &quot;</span>
<span class="s2">&quot;to more than </span><span class="si">{0}</span><span class="s2"> rows. Note that, before changing the &quot;</span>
<span class="s2">&quot;&#39;compute.max_rows&#39;, this operation is considerably expensive.&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">max_compute_count</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">uniq_pdf</span> <span class="o">=</span> <span class="n">uniq_sdf</span><span class="o">.</span><span class="n">toPandas</span><span class="p">()</span>
<span class="c1"># pandas takes both NaN and null in Spark to np.nan, so de-duplication is required</span>
<span class="n">uniq_series</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span><span class="n">uniq_pdf</span><span class="p">)</span><span class="o">.</span><span class="n">drop_duplicates</span><span class="p">()</span>
<span class="n">uniques_list</span> <span class="o">=</span> <span class="n">uniq_series</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span>
<span class="n">uniques_list</span> <span class="o">=</span> <span class="nb">sorted</span><span class="p">(</span><span class="n">uniques_list</span><span class="p">,</span> <span class="n">key</span><span class="o">=</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">isna</span><span class="p">(</span><span class="n">x</span><span class="p">),</span> <span class="n">x</span><span class="p">))</span>
<span class="c1"># Constructs `unique_to_code` mapping non-na unique to code</span>
<span class="n">unique_to_code</span> <span class="o">=</span> <span class="p">{}</span>
<span class="k">if</span> <span class="n">na_sentinel</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">na_sentinel_code</span> <span class="o">=</span> <span class="n">na_sentinel</span>
<span class="n">code</span> <span class="o">=</span> <span class="mi">0</span>
<span class="k">for</span> <span class="n">unique</span> <span class="ow">in</span> <span class="n">uniques_list</span><span class="p">:</span>
<span class="k">if</span> <span class="n">pd</span><span class="o">.</span><span class="n">isna</span><span class="p">(</span><span class="n">unique</span><span class="p">):</span>
<span class="k">if</span> <span class="n">na_sentinel</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">na_sentinel_code</span> <span class="o">=</span> <span class="n">code</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">unique_to_code</span><span class="p">[</span><span class="n">unique</span><span class="p">]</span> <span class="o">=</span> <span class="n">code</span>
<span class="n">code</span> <span class="o">+=</span> <span class="mi">1</span>
<span class="n">kvs</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span>
<span class="n">chain</span><span class="p">(</span><span class="o">*</span><span class="p">([(</span><span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">unique</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">code</span><span class="p">))</span> <span class="k">for</span> <span class="n">unique</span><span class="p">,</span> <span class="n">code</span> <span class="ow">in</span> <span class="n">unique_to_code</span><span class="o">.</span><span class="n">items</span><span class="p">()]))</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">kvs</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> <span class="c1"># uniques are all missing values</span>
<span class="n">new_scol</span> <span class="o">=</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">na_sentinel_code</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">map_scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">create_map</span><span class="p">(</span><span class="o">*</span><span class="n">kvs</span><span class="p">)</span>
<span class="n">null_scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">isnull</span><span class="p">()</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">na_sentinel_code</span><span class="p">))</span>
<span class="n">new_scol</span> <span class="o">=</span> <span class="n">null_scol</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">map_scol</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">])</span>
<span class="n">codes</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">new_scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]))</span>
<span class="k">if</span> <span class="n">na_sentinel</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># Drops the NaN from the uniques of the values</span>
<span class="n">uniques_list</span> <span class="o">=</span> <span class="p">[</span><span class="n">x</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">uniques_list</span> <span class="k">if</span> <span class="ow">not</span> <span class="n">pd</span><span class="o">.</span><span class="n">isna</span><span class="p">(</span><span class="n">x</span><span class="p">)]</span>
<span class="n">uniques</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">(</span><span class="n">uniques_list</span><span class="p">)</span>
<span class="k">return</span> <span class="n">codes</span><span class="p">,</span> <span class="n">uniques</span>
<span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">doctest</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span>
<span class="kn">import</span> <span class="nn">pyspark.pandas.base</span>
<span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_HOME&quot;</span><span class="p">])</span>
<span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">base</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;ps&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span>
<span class="n">spark</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[4]&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;pyspark.pandas.base tests&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">base</span><span class="p">,</span>
<span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span>
<span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="n">_test</span><span class="p">()</span>
</pre></div>
</div>
<div class='prev-next-bottom'>
</div>
</main>
</div>
</div>
<script src="../../../_static/js/index.3da636dd464baa7582d2.js"></script>
<footer class="footer mt-5 mt-md-0">
<div class="container">
<p>
<!-- NOTE(review): the copyright holder renders empty here; presumably the Sphinx `copyright` setting is unset. Confirm and fill it in at the source. -->
&copy; Copyright .<br/>
<!-- Link to the canonical Sphinx site over HTTPS instead of plain HTTP. -->
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.0.4.<br/>
</p>
</div>
</footer>
</body>
</html>