blob: 6bd59f1414083da12bfba47032ee176e097cd9e4 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>pyspark.pandas.base &#8212; PySpark 4.0.0-preview2 documentation</title>
<script data-cfasync="false">
// Copy the saved "mode" and "theme" values from localStorage onto the root
// element's data-mode / data-theme attributes. When nothing is stored, mode
// falls back to "" and theme falls back to "light".
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../../../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52" />
<link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" />
<script data-url_root="../../../" id="documentation_options" src="../../../_static/documentation_options.js"></script>
<script src="../../../_static/jquery.js"></script>
<script src="../../../_static/underscore.js"></script>
<script src="../../../_static/doctools.js"></script>
<script src="../../../_static/clipboard.min.js"></script>
<script src="../../../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script>DOCUMENTATION_OPTIONS.pagename = '_modules/pyspark/pandas/base';</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/pandas/base.html" />
<link rel="search" title="Search" href="../../../search.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="None">
<!-- Matomo -->
<script type="text/javascript">
// Matomo command queue: reuse window._paq when it already exists, otherwise
// start a new array. Commands are queued below as [name, ...args] arrays.
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
// Base URL of the analytics server; both the tracker endpoint (matomo.php)
// and the tracker script (matomo.js) are built from it.
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
// Create an async script element for matomo.js and insert it before the
// first script element found in the document.
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<a class="skip-link" href="#main-content">Skip to main content</a>
<input type="checkbox"
class="sidebar-toggle"
name="__primary"
id="__primary"/>
<label class="overlay overlay-primary" for="__primary"></label>
<input type="checkbox"
class="sidebar-toggle"
name="__secondary"
id="__secondary"/>
<label class="overlay overlay-secondary" for="__secondary"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../../../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<nav class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<label class="sidebar-toggle primary-toggle" for="__primary">
<span class="fa-solid fa-bars"></span>
</label>
<div class="navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../../../index.html">
<img src="https://spark.apache.org/images/spark-logo.png" class="logo__image only-light" alt="Logo image"/>
<script>document.write(`<img src="https://spark.apache.org/images/spark-logo-rev.svg" class="logo__image only-dark" alt="Logo image"/>`);</script>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<script>
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview2
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
/**
 * Build the docs homepage URL for one version entry.
 *
 * @param {{version: string}} entry - Version descriptor; only `version` is read.
 * @returns {string} The URL template with its "{version}" placeholder filled in.
 */
function buildURL(entry) {
  // URL template supplied by jinja when the page is rendered.
  const urlTemplate = "https://spark.apache.org/docs/{version}/api/python/index.html";
  return urlTemplate.replace("{version}", entry.version);
}
/**
 * Click handler for a version-switcher link.
 *
 * Sends a HEAD request to find out whether the page currently being viewed
 * also exists in the docs of the clicked version. If it does, navigate to
 * that page; if the request fails, navigate to the clicked version's docs
 * homepage (the link's own href) instead.
 *
 * @param {Event} event - Click event whose target is the version link.
 * @returns {boolean} Always false, which cancels the link's default
 *   navigation so the redirect can be decided by the request outcome.
 */
function checkPageExistsAndRedirect(event) {
  const currentFilePath = "_modules/pyspark/pandas/base.html",
        otherDocsHomepage = event.target.getAttribute("href"),
        // The homepage href produced by buildURL ends in "index.html".
        // Appending the page path to it directly would yield
        // ".../index.html_modules/...", which never exists, so drop the
        // trailing file name and append to the docs root directory instead.
        otherDocsRoot = otherDocsHomepage.replace(/index\.html$/, "");
  let tryUrl = `${otherDocsRoot}${currentFilePath}`;
  $.ajax({
    type: 'HEAD',
    url: tryUrl,
    // if the page exists, go there
    success: function() {
      location.href = tryUrl;
    }
  }).fail(function() {
    // otherwise land on the homepage of the other docs version
    location.href = otherDocsHomepage;
  });
  return false;
}
// Populate the version switcher dropdown that sits next to this script.
(function () {
  // The page contains more than one element with id="version_switcher", and
  // an id selector only matches the first of them. Resolve the dropdown menu
  // inside the version-button container that immediately precedes this
  // script instead. document.currentScript is only set while the script is
  // executing synchronously, so it is read here rather than inside the
  // asynchronous getJSON callback. Fall back to the id lookup if the
  // neighbouring container cannot be found.
  const ownScript = document.currentScript;
  const ownContainer = ownScript ? ownScript.previousElementSibling : null;
  const ownMenu = ownContainer ? ownContainer.querySelector(".dropdown-menu") : null;
  const $menu = ownMenu ? $(ownMenu) : $("#version_switcher");

  // get JSON config
  $.getJSON("https://spark.apache.org/static/versions.json", function (data) {
    // Append one link per entry, in the order the entries are listed. Each
    // link's href is the homepage of that docs version; the click handler
    // then tries the matching page in that version first.
    $.each(data, function (index, entry) {
      // if no custom name specified (e.g., "latest"), use version string
      if (!("name" in entry)) {
        entry.name = entry.version;
      }
      // construct the appropriate URL, and add it to the dropdown
      entry.url = buildURL(entry);
      const node = document.createElement("a");
      node.setAttribute("class", "list-group-item list-group-item-action py-1");
      node.setAttribute("href", `${entry.url}`);
      node.textContent = `${entry.name}`;
      node.onclick = checkPageExistsAndRedirect;
      $menu.append(node);
    });
  });
})();
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<script>
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
</div>
</nav>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar hide-on-wide">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview2
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
/**
 * Build the docs homepage URL for one version entry.
 *
 * @param {{version: string}} entry - Version descriptor; only `version` is read.
 * @returns {string} The URL template with its "{version}" placeholder filled in.
 */
function buildURL(entry) {
  // URL template supplied by jinja when the page is rendered.
  const urlTemplate = "https://spark.apache.org/docs/{version}/api/python/index.html";
  return urlTemplate.replace("{version}", entry.version);
}
/**
 * Click handler for a version-switcher link.
 *
 * Sends a HEAD request to find out whether the page currently being viewed
 * also exists in the docs of the clicked version. If it does, navigate to
 * that page; if the request fails, navigate to the clicked version's docs
 * homepage (the link's own href) instead.
 *
 * @param {Event} event - Click event whose target is the version link.
 * @returns {boolean} Always false, which cancels the link's default
 *   navigation so the redirect can be decided by the request outcome.
 */
function checkPageExistsAndRedirect(event) {
  const currentFilePath = "_modules/pyspark/pandas/base.html",
        otherDocsHomepage = event.target.getAttribute("href"),
        // The homepage href produced by buildURL ends in "index.html".
        // Appending the page path to it directly would yield
        // ".../index.html_modules/...", which never exists, so drop the
        // trailing file name and append to the docs root directory instead.
        otherDocsRoot = otherDocsHomepage.replace(/index\.html$/, "");
  let tryUrl = `${otherDocsRoot}${currentFilePath}`;
  $.ajax({
    type: 'HEAD',
    url: tryUrl,
    // if the page exists, go there
    success: function() {
      location.href = tryUrl;
    }
  }).fail(function() {
    // otherwise land on the homepage of the other docs version
    location.href = otherDocsHomepage;
  });
  return false;
}
// Populate the version switcher dropdown that sits next to this script.
(function () {
  // The page contains more than one element with id="version_switcher", and
  // an id selector only matches the first of them. Resolve the dropdown menu
  // inside the version-button container that immediately precedes this
  // script instead. document.currentScript is only set while the script is
  // executing synchronously, so it is read here rather than inside the
  // asynchronous getJSON callback. Fall back to the id lookup if the
  // neighbouring container cannot be found.
  const ownScript = document.currentScript;
  const ownContainer = ownScript ? ownScript.previousElementSibling : null;
  const ownMenu = ownContainer ? ownContainer.querySelector(".dropdown-menu") : null;
  const $menu = ownMenu ? $(ownMenu) : $("#version_switcher");

  // get JSON config
  $.getJSON("https://spark.apache.org/static/versions.json", function (data) {
    // Append one link per entry, in the order the entries are listed. Each
    // link's href is the homepage of that docs version; the click handler
    // then tries the matching page in that version first.
    $.each(data, function (index, entry) {
      // if no custom name specified (e.g., "latest"), use version string
      if (!("name" in entry)) {
        entry.name = entry.version;
      }
      // construct the appropriate URL, and add it to the dropdown
      entry.url = buildURL(entry);
      const node = document.createElement("a");
      node.setAttribute("class", "list-group-item list-group-item-action py-1");
      node.setAttribute("href", `${entry.url}`);
      node.textContent = `${entry.name}`;
      node.onclick = checkPageExistsAndRedirect;
      $menu.append(node);
    });
  });
})();
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<!-- Breadcrumb trail. The labelled nav element is the single navigation
     landmark; the inner list keeps its native list semantics so assistive
     technology can announce the number of items. -->
<nav aria-label="Breadcrumbs">
  <ul class="bd-breadcrumbs">
    <li class="breadcrumb-item breadcrumb-home">
      <a href="../../../index.html" class="nav-link" aria-label="Home">
        <i class="fa-solid fa-home"></i>
      </a>
    </li>
    <li class="breadcrumb-item"><a href="../../index.html" class="nav-link">Module code</a></li>
    <li class="breadcrumb-item active" aria-current="page">pyspark.pandas.base</li>
  </ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article" role="main">
<h1>Source code for pyspark.pandas.base</h1><div class="highlight"><pre>
<span></span><span class="c1">#</span>
<span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span>
<span class="c1"># contributor license agreements. See the NOTICE file distributed with</span>
<span class="c1"># this work for additional information regarding copyright ownership.</span>
<span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span>
<span class="c1"># (the &quot;License&quot;); you may not use this file except in compliance with</span>
<span class="c1"># the License. You may obtain a copy of the License at</span>
<span class="c1">#</span>
<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
<span class="c1">#</span>
<span class="c1"># Unless required by applicable law or agreed to in writing, software</span>
<span class="c1"># distributed under the License is distributed on an &quot;AS IS&quot; BASIS,</span>
<span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
<span class="c1"># See the License for the specific language governing permissions and</span>
<span class="c1"># limitations under the License.</span>
<span class="c1">#</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd">Base and utility classes for pandas-on-Spark objects.</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="kn">import</span> <span class="nn">warnings</span>
<span class="kn">from</span> <span class="nn">abc</span> <span class="kn">import</span> <span class="n">ABCMeta</span><span class="p">,</span> <span class="n">abstractmethod</span>
<span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">wraps</span><span class="p">,</span> <span class="n">partial</span>
<span class="kn">from</span> <span class="nn">itertools</span> <span class="kn">import</span> <span class="n">chain</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="n">Any</span><span class="p">,</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">Sequence</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">,</span> <span class="n">Union</span><span class="p">,</span> <span class="n">cast</span><span class="p">,</span> <span class="n">TYPE_CHECKING</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="kn">from</span> <span class="nn">pandas.api.types</span> <span class="kn">import</span> <span class="n">is_list_like</span><span class="p">,</span> <span class="n">CategoricalDtype</span> <span class="c1"># type: ignore[attr-defined]</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span><span class="p">,</span> <span class="n">Column</span><span class="p">,</span> <span class="n">Window</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">LongType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">,</span> <span class="n">NumericType</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">pandas</span> <span class="k">as</span> <span class="n">ps</span> <span class="c1"># For running doctests and reference resolution in PyCharm.</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas._typing</span> <span class="kn">import</span> <span class="n">Axis</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">,</span> <span class="n">IndexOpsLike</span><span class="p">,</span> <span class="n">Label</span><span class="p">,</span> <span class="n">SeriesOrIndex</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.config</span> <span class="kn">import</span> <span class="n">get_option</span><span class="p">,</span> <span class="n">option_context</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.internal</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">InternalField</span><span class="p">,</span>
<span class="n">InternalFrame</span><span class="p">,</span>
<span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span>
<span class="n">SPARK_DEFAULT_INDEX_NAME</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.spark.accessors</span> <span class="kn">import</span> <span class="n">SparkIndexOpsMethods</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.typedef</span> <span class="kn">import</span> <span class="n">extension_dtypes</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.utils</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">combine_frames</span><span class="p">,</span>
<span class="n">same_anchor</span><span class="p">,</span>
<span class="n">scol_for</span><span class="p">,</span>
<span class="n">validate_axis</span><span class="p">,</span>
<span class="n">ERROR_MESSAGE_CANNOT_COMBINE</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.frame</span> <span class="kn">import</span> <span class="n">DataFrame</span>
<span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.sql._typing</span> <span class="kn">import</span> <span class="n">ColumnOrName</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.data_type_ops.base</span> <span class="kn">import</span> <span class="n">DataTypeOps</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span>
<span class="k">def</span> <span class="nf">should_alignment_for_column_op</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">SeriesOrIndex</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">SeriesOrIndex</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">return</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> <span class="ow">is</span> <span class="ow">not</span> <span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="k">def</span> <span class="nf">align_diff_index_ops</span><span class="p">(</span>
<span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> <span class="n">this_index_ops</span><span class="p">:</span> <span class="n">SeriesOrIndex</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Align the `IndexOpsMixin` objects and apply the function.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> func : The function to apply</span>
<span class="sd"> this_index_ops : IndexOpsMixin</span>
<span class="sd"> A base `IndexOpsMixin` object</span>
<span class="sd"> args : list of other arguments including other `IndexOpsMixin` objects</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> `Index` if all `this_index_ops` and arguments are `Index`; otherwise `Series`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes</span> <span class="kn">import</span> <span class="n">Index</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span><span class="p">,</span> <span class="n">first_series</span>
<span class="n">cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">arg</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">IndexOpsMixin</span><span class="p">)]</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">this_index_ops</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">):</span>
<span class="n">combined</span> <span class="o">=</span> <span class="n">combine_frames</span><span class="p">(</span>
<span class="n">this_index_ops</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(),</span>
<span class="o">*</span><span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Series</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">cols</span><span class="p">)],</span>
<span class="n">how</span><span class="o">=</span><span class="s2">&quot;full&quot;</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">column_op</span><span class="p">(</span><span class="n">func</span><span class="p">)(</span>
<span class="n">combined</span><span class="p">[</span><span class="s2">&quot;this&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">combined</span><span class="p">[</span><span class="s2">&quot;this&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">[</span><span class="mi">0</span><span class="p">]),</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">combined</span><span class="p">[</span><span class="s2">&quot;that&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">combined</span><span class="p">[</span><span class="s2">&quot;that&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="p">],</span>
<span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">this_index_ops</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># This could cause as many counts, reset_index calls, joins for combining</span>
<span class="c1"># as the number of `Index`s in `args`. So far it&#39;s fine since we can assume the ops</span>
<span class="c1"># only work between at most two `Index`s. We might need to fix it in the future.</span>
<span class="n">self_len</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">this_index_ops</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="o">!=</span> <span class="n">self_len</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">args</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">IndexOpsMixin</span><span class="p">)):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;operands could not be broadcast together with shapes&quot;</span><span class="p">)</span>
<span class="k">with</span> <span class="n">option_context</span><span class="p">(</span><span class="s2">&quot;compute.default_index_type&quot;</span><span class="p">,</span> <span class="s2">&quot;distributed-sequence&quot;</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">this_index_ops</span><span class="p">,</span> <span class="n">Index</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Index</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">):</span>
<span class="k">return</span> <span class="n">Index</span><span class="p">(</span>
<span class="n">column_op</span><span class="p">(</span><span class="n">func</span><span class="p">)(</span>
<span class="n">this_index_ops</span><span class="o">.</span><span class="n">to_series</span><span class="p">()</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">),</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">arg</span><span class="o">.</span><span class="n">to_series</span><span class="p">()</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">Index</span><span class="p">)</span>
<span class="k">else</span> <span class="n">arg</span>
<span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span>
<span class="p">],</span>
<span class="p">)</span><span class="o">.</span><span class="n">sort_index</span><span class="p">(),</span>
<span class="n">name</span><span class="o">=</span><span class="n">this_index_ops</span><span class="o">.</span><span class="n">name</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">this_index_ops</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="n">this</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">this_index_ops</span><span class="o">.</span><span class="n">reset_index</span><span class="p">())</span>
<span class="n">that</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">cast</span><span class="p">(</span><span class="n">Series</span><span class="p">,</span> <span class="n">col</span><span class="o">.</span><span class="n">to_series</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Index</span><span class="p">)</span> <span class="k">else</span> <span class="n">col</span><span class="p">)</span>
<span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">i</span><span class="p">)</span>
<span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span>
<span class="p">]</span>
<span class="n">combined</span> <span class="o">=</span> <span class="n">combine_frames</span><span class="p">(</span><span class="n">this</span><span class="p">,</span> <span class="o">*</span><span class="n">that</span><span class="p">,</span> <span class="n">how</span><span class="o">=</span><span class="s2">&quot;full&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">sort_index</span><span class="p">()</span>
<span class="n">combined</span> <span class="o">=</span> <span class="n">combined</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span>
<span class="n">combined</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">[:</span> <span class="n">this_index_ops</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">]</span>
<span class="p">)</span>
<span class="n">combined</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span> <span class="o">=</span> <span class="n">this_index_ops</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span>
<span class="k">return</span> <span class="n">column_op</span><span class="p">(</span><span class="n">func</span><span class="p">)(</span>
<span class="n">first_series</span><span class="p">(</span><span class="n">combined</span><span class="p">[</span><span class="s2">&quot;this&quot;</span><span class="p">]),</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">combined</span><span class="p">[</span><span class="s2">&quot;that&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">combined</span><span class="p">[</span><span class="s2">&quot;that&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="p">],</span>
<span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">this_index_ops</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">this</span> <span class="o">=</span> <span class="n">this_index_ops</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">that_series</span> <span class="o">=</span> <span class="nb">next</span><span class="p">(</span><span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Series</span><span class="p">))</span>
<span class="n">that_frame</span> <span class="o">=</span> <span class="n">that_series</span><span class="o">.</span><span class="n">_psdf</span><span class="p">[</span>
<span class="p">[</span>
<span class="n">cast</span><span class="p">(</span><span class="n">Series</span><span class="p">,</span> <span class="n">col</span><span class="o">.</span><span class="n">to_series</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Index</span><span class="p">)</span> <span class="k">else</span> <span class="n">col</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">i</span><span class="p">)</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">cols</span><span class="p">)</span>
<span class="p">]</span>
<span class="p">]</span>
<span class="n">combined</span> <span class="o">=</span> <span class="n">combine_frames</span><span class="p">(</span><span class="n">this</span><span class="p">,</span> <span class="n">that_frame</span><span class="o">.</span><span class="n">reset_index</span><span class="p">())</span><span class="o">.</span><span class="n">sort_index</span><span class="p">()</span>
<span class="n">self_index</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">combined</span><span class="p">[</span><span class="s2">&quot;this&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span><span class="n">combined</span><span class="p">[</span><span class="s2">&quot;this&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span><span class="o">.</span><span class="n">index</span>
<span class="p">)</span>
<span class="n">other</span> <span class="o">=</span> <span class="n">combined</span><span class="p">[</span><span class="s2">&quot;that&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">set_index</span><span class="p">(</span>
<span class="n">combined</span><span class="p">[</span><span class="s2">&quot;that&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">[:</span> <span class="n">that_series</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">]</span>
<span class="p">)</span>
<span class="n">other</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">names</span> <span class="o">=</span> <span class="n">that_series</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span>
<span class="k">return</span> <span class="n">column_op</span><span class="p">(</span><span class="n">func</span><span class="p">)(</span>
<span class="n">self_index</span><span class="p">,</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">other</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">col</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">other</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">cols</span><span class="p">)</span>
<span class="p">],</span>
<span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">that_series</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">booleanize_null</span><span class="p">(</span><span class="n">scol</span><span class="p">:</span> <span class="n">Column</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
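<span class="sd"> Booleanize Null in Spark Column.</span>
<span class="sd"> If `f` is a comparison operator, null results are replaced with True for `!=`</span>
<span class="sd"> and with False otherwise; for any other `f`, `scol` is returned unchanged.</span>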
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">comp_ops</span> <span class="o">=</span> <span class="p">[</span>
<span class="nb">getattr</span><span class="p">(</span><span class="n">Column</span><span class="p">,</span> <span class="s2">&quot;__</span><span class="si">{}</span><span class="s2">__&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">comp_op</span><span class="p">))</span>
<span class="k">for</span> <span class="n">comp_op</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;eq&quot;</span><span class="p">,</span> <span class="s2">&quot;ne&quot;</span><span class="p">,</span> <span class="s2">&quot;lt&quot;</span><span class="p">,</span> <span class="s2">&quot;le&quot;</span><span class="p">,</span> <span class="s2">&quot;ge&quot;</span><span class="p">,</span> <span class="s2">&quot;gt&quot;</span><span class="p">]</span>
<span class="p">]</span>
<span class="k">if</span> <span class="n">f</span> <span class="ow">in</span> <span class="n">comp_ops</span><span class="p">:</span>
<span class="c1"># If `f` is &quot;!=&quot;, fill nulls with True; otherwise fill them with False.</span>
<span class="n">filler</span> <span class="o">=</span> <span class="n">f</span> <span class="o">==</span> <span class="n">Column</span><span class="o">.</span><span class="fm">__ne__</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="n">filler</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span>
<span class="k">return</span> <span class="n">scol</span>
<span class="k">def</span> <span class="nf">column_op</span><span class="p">(</span><span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">SeriesOrIndex</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A decorator that wraps APIs taking/returning Spark Column so that pandas-on-Spark Series can be</span>
<span class="sd"> supported too. If this decorator is used for the `f` function that takes Spark Column and</span>
<span class="sd"> returns Spark Column, decorated `f` takes pandas-on-Spark Series as well and returns</span>
<span class="sd"> pandas-on-Spark Series.</span>
<span class="sd"> :param f: a function that takes Spark Column and returns Spark Column.</span>
<span class="sd"> :param self: pandas-on-Spark Series</span>
<span class="sd"> :param args: arguments that the function `f` takes.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@wraps</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">wrapper</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">SeriesOrIndex</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes.base</span> <span class="kn">import</span> <span class="n">Index</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span>
<span class="c1"># It is possible for the function `f` to take arguments other than Spark Columns.</span>
<span class="c1"># To cover this case, explicitly check whether each argument is a pandas-on-Spark</span>
<span class="c1"># Series or Index and extract its Spark Column. Other arguments are used as they are.</span>
<span class="n">cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">arg</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="p">(</span><span class="n">Series</span><span class="p">,</span> <span class="n">Index</span><span class="p">))]</span>
<span class="k">if</span> <span class="nb">all</span><span class="p">(</span><span class="ow">not</span> <span class="n">should_alignment_for_column_op</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">):</span>
<span class="c1"># Same DataFrame anchors</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">f</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span>
<span class="o">*</span><span class="p">[</span><span class="n">arg</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">IndexOpsMixin</span><span class="p">)</span> <span class="k">else</span> <span class="n">arg</span> <span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">],</span>
<span class="p">)</span>
<span class="n">field</span> <span class="o">=</span> <span class="n">InternalField</span><span class="o">.</span><span class="n">from_struct_field</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">scol</span><span class="p">)</span><span class="o">.</span><span class="n">schema</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span>
<span class="n">use_extension_dtypes</span><span class="o">=</span><span class="nb">any</span><span class="p">(</span>
<span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">extension_dtypes</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="p">[</span><span class="bp">self</span><span class="p">]</span> <span class="o">+</span> <span class="n">cols</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">field</span><span class="o">.</span><span class="n">is_extension_dtype</span><span class="p">:</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">booleanize_null</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">f</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">field</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">):</span>
<span class="n">index_ops</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">field</span><span class="o">=</span><span class="n">field</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">psser</span> <span class="o">=</span> <span class="nb">next</span><span class="p">(</span><span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">Series</span><span class="p">))</span>
<span class="n">index_ops</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">field</span><span class="o">=</span><span class="n">field</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">&quot;compute.ops_on_diff_frames&quot;</span><span class="p">):</span>
<span class="n">index_ops</span> <span class="o">=</span> <span class="n">align_diff_index_ops</span><span class="p">(</span><span class="n">f</span><span class="p">,</span> <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">ERROR_MESSAGE_CANNOT_COMBINE</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span> <span class="o">==</span> <span class="n">col</span><span class="o">.</span><span class="n">name</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">cols</span><span class="p">):</span>
<span class="n">index_ops</span> <span class="o">=</span> <span class="n">index_ops</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span>
<span class="k">return</span> <span class="n">index_ops</span>
<span class="k">return</span> <span class="n">wrapper</span>
<span class="k">def</span> <span class="nf">numpy_column_op</span><span class="p">(</span><span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">SeriesOrIndex</span><span class="p">]:</span>
<span class="nd">@wraps</span><span class="p">(</span><span class="n">f</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">wrapper</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">SeriesOrIndex</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="c1"># PySpark does not support NumPy types out of the box. For now, we convert NumPy types</span>
<span class="c1"># into primitive types that PySpark understands.</span>
<span class="n">new_args</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">arg</span> <span class="ow">in</span> <span class="n">args</span><span class="p">:</span>
<span class="c1"># TODO: This is a quick hack to support NumPy types. We should revisit this.</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">LongType</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">arg</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">timedelta64</span><span class="p">):</span>
<span class="n">new_args</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="nb">float</span><span class="p">(</span><span class="n">arg</span> <span class="o">/</span> <span class="n">np</span><span class="o">.</span><span class="n">timedelta64</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="s2">&quot;s&quot;</span><span class="p">)))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">new_args</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">arg</span><span class="p">)</span>
<span class="k">return</span> <span class="n">column_op</span><span class="p">(</span><span class="n">f</span><span class="p">)(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">new_args</span><span class="p">)</span>
<span class="k">return</span> <span class="n">wrapper</span>
<span class="k">class</span> <span class="nc">IndexOpsMixin</span><span class="p">(</span><span class="nb">object</span><span class="p">,</span> <span class="n">metaclass</span><span class="o">=</span><span class="n">ABCMeta</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Common ops mixin to support a unified interface / docs for Series / Index.</span>
<span class="sd"> Assumes the following attributes, properties, and functions exist.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@property</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_internal</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">InternalFrame</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@property</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_psdf</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_with_new_scol</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">,</span> <span class="n">scol</span><span class="p">:</span> <span class="n">Column</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">field</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">InternalField</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@property</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_column_label</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Label</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="nd">@property</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">spark</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SparkIndexOpsMethods</span><span class="p">[</span><span class="n">IndexOpsLike</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">_dtype_op</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataTypeOps&quot;</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.data_type_ops.base</span> <span class="kn">import</span> <span class="n">DataTypeOps</span>
<span class="k">return</span> <span class="n">DataTypeOps</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">)</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">copy</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="k">pass</span>
<span class="c1"># arithmetic operators</span>
<span class="k">def</span> <span class="fm">__neg__</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">neg</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__add__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__sub__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">sub</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__mul__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">mul</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__truediv__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __truediv__ has different behaviour between pandas and PySpark for several cases.</span>
<span class="sd"> 1. When dividing np.inf by zero, PySpark returns null whereas pandas returns np.inf</span>
<span class="sd"> 2. When dividing a positive number by zero, PySpark returns null</span>
<span class="sd"> whereas pandas returns np.inf</span>
<span class="sd"> 3. When dividing -np.inf by zero, PySpark returns null whereas pandas returns -np.inf</span>
<span class="sd"> 4. When dividing a negative number by zero, PySpark returns null whereas pandas returns -np.inf</span>
<span class="sd"> +-------------------------------------------+</span>
<span class="sd"> | dividend (divisor: 0) | PySpark | pandas |</span>
<span class="sd"> |-----------------------|---------|---------|</span>
<span class="sd"> | np.inf | null | np.inf |</span>
<span class="sd"> | -np.inf | null | -np.inf |</span>
<span class="sd"> | 10 | null | np.inf |</span>
<span class="sd"> | -10 | null | -np.inf |</span>
<span class="sd"> +-----------------------|---------|---------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">truediv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__mod__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">mod</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__radd__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">radd</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rsub__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rsub</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rmul__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rmul</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rtruediv__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rtruediv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__floordiv__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> __floordiv__ has different behaviour between pandas and PySpark for several cases.</span>
<span class="sd"> 1. When dividing np.inf by zero, PySpark returns null whereas pandas returns np.inf</span>
<span class="sd"> 2. When dividing a positive number by zero, PySpark returns null</span>
<span class="sd"> whereas pandas returns np.inf</span>
<span class="sd"> 3. When dividing -np.inf by zero, PySpark returns null whereas pandas returns -np.inf</span>
<span class="sd"> 4. When dividing a negative number by zero, PySpark returns null whereas pandas returns -np.inf</span>
<span class="sd"> +-------------------------------------------+</span>
<span class="sd"> | dividend (divisor: 0) | PySpark | pandas |</span>
<span class="sd"> |-----------------------|---------|---------|</span>
<span class="sd"> | np.inf | null | np.inf |</span>
<span class="sd"> | -np.inf | null | -np.inf |</span>
<span class="sd"> | 10 | null | np.inf |</span>
<span class="sd"> | -10 | null | -np.inf |</span>
<span class="sd"> +-----------------------|---------|---------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">floordiv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rfloordiv__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rfloordiv</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rmod__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rmod</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__pow__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">pow</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rpow__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rpow</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__abs__</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="c1"># comparison operators</span>
<span class="k">def</span> <span class="fm">__eq__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span> <span class="c1"># type: ignore[override]</span>
<span class="c1"># pandas always returns False for all items with dict and set.</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">other</span><span class="p">,</span> <span class="p">(</span><span class="nb">dict</span><span class="p">,</span> <span class="nb">set</span><span class="p">)):</span>
<span class="k">return</span> <span class="bp">self</span> <span class="o">!=</span> <span class="bp">self</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">eq</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__ne__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span> <span class="c1"># type: ignore[override]</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">ne</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__lt__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">lt</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__le__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">le</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__ge__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">ge</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__gt__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">gt</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__invert__</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">invert</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="c1"># `and`, `or`, `not` cannot be overloaded in Python,</span>
<span class="c1"># so use bitwise operators as boolean operators</span>
<span class="k">def</span> <span class="fm">__and__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="fm">__and__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__or__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="fm">__or__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rand__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__ror__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">ror</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__xor__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">xor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__rxor__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">rxor</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__len__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">)</span>
<span class="c1"># NDArray Compat</span>
<span class="k">def</span> <span class="nf">__array_ufunc__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">ufunc</span><span class="p">:</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">inputs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SeriesOrIndex</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas</span> <span class="kn">import</span> <span class="n">numpy_compat</span>
<span class="c1"># Try dunder methods first.</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">numpy_compat</span><span class="o">.</span><span class="n">maybe_dispatch_ufunc_to_dunder_op</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">ufunc</span><span class="p">,</span> <span class="n">method</span><span class="p">,</span> <span class="o">*</span><span class="n">inputs</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span>
<span class="p">)</span>
<span class="c1"># After that, we try with PySpark APIs.</span>
<span class="k">if</span> <span class="n">result</span> <span class="ow">is</span> <span class="bp">NotImplemented</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">numpy_compat</span><span class="o">.</span><span class="n">maybe_dispatch_ufunc_to_spark_func</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">ufunc</span><span class="p">,</span> <span class="n">method</span><span class="p">,</span> <span class="o">*</span><span class="n">inputs</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">result</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">NotImplemented</span><span class="p">:</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">SeriesOrIndex</span><span class="p">,</span> <span class="n">result</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># TODO: support more APIs?</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span>
<span class="s2">&quot;pandas-on-Spark objects currently do not support </span><span class="si">%s</span><span class="s2">.&quot;</span> <span class="o">%</span> <span class="n">ufunc</span>
<span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">dtype</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Dtype</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Return the dtype object of the underlying data.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([1, 2, 3])</span>
<span class="sd"> &gt;&gt;&gt; s.dtype</span>
<span class="sd"> dtype(&#39;int64&#39;)</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series(list(&#39;abc&#39;))</span>
<span class="sd"> &gt;&gt;&gt; s.dtype</span>
<span class="sd"> dtype(&#39;O&#39;)</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series(pd.date_range(&#39;20130101&#39;, periods=3))</span>
<span class="sd"> &gt;&gt;&gt; s.dtype</span>
<span class="sd"> dtype(&#39;&lt;M8[ns]&#39;)</span>
<span class="sd"> &gt;&gt;&gt; s.rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.dtype</span>
<span class="sd"> dtype(&#39;&lt;M8[ns]&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">dtype</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">empty</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns true if the current object is empty. Otherwise, it returns false.</span>
<span class="sd"> &gt;&gt;&gt; ps.range(10).id.empty</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.range(0).id.empty</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({}, index=list(&#39;abc&#39;)).index.empty</span>
<span class="sd"> False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">isEmpty</span><span class="p">()</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">hasnans</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return True if it has any missing values. Otherwise, it returns False.</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({}, index=list(&#39;abc&#39;)).index.hasnans</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([&#39;a&#39;, None]).hasnans</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1.0, 2.0, np.nan]).hasnans</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3]).hasnans</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; (ps.Series([1.0, 2.0, np.nan]) + 1).hasnans</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3]).rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.hasnans</span>
<span class="sd"> False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">isnull</span><span class="p">()</span><span class="o">.</span><span class="n">any</span><span class="p">()</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">is_monotonic_increasing</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return True if values in the object are monotonically increasing, False otherwise.</span>
<span class="sd"> .. note:: the current implementation of is_monotonic_increasing requires shuffling</span>
<span class="sd"> and aggregating multiple times to check the order locally and globally,</span>
<span class="sd"> which is potentially expensive. In case of multi-index, all data is</span>
<span class="sd"> transferred to a single node which can easily cause out-of-memory errors.</span>
<span class="sd"> .. note:: Disable the Spark config `spark.sql.optimizer.nestedSchemaPruning.enabled`</span>
<span class="sd"> for multi-index if you&#39;re using pandas-on-Spark &lt; 1.7.0 with PySpark 3.1.1.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> is_monotonic : bool</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([&#39;1/1/2018&#39;, &#39;3/1/2018&#39;, &#39;4/1/2018&#39;])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic_increasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;dates&#39;: [None, &#39;1/1/2018&#39;, &#39;2/1/2018&#39;, &#39;3/1/2018&#39;]})</span>
<span class="sd"> &gt;&gt;&gt; df.dates.is_monotonic_increasing</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; df.index.is_monotonic_increasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([1])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic_increasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic_increasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser.rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.is_monotonic_increasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([5, 4, 3, 2, 1], index=[1, 2, 3, 4, 5])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic_increasing</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ser.index.is_monotonic_increasing</span>
<span class="sd"> True</span>
<span class="sd"> Support for MultiIndex</span>
<span class="sd"> &gt;&gt;&gt; midx = ps.MultiIndex.from_tuples(</span>
<span class="sd"> ... [(&#39;x&#39;, &#39;a&#39;), (&#39;x&#39;, &#39;b&#39;), (&#39;y&#39;, &#39;c&#39;), (&#39;y&#39;, &#39;d&#39;), (&#39;z&#39;, &#39;e&#39;)])</span>
<span class="sd"> &gt;&gt;&gt; midx # doctest: +SKIP</span>
<span class="sd"> MultiIndex([(&#39;x&#39;, &#39;a&#39;),</span>
<span class="sd"> (&#39;x&#39;, &#39;b&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;c&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;d&#39;),</span>
<span class="sd"> (&#39;z&#39;, &#39;e&#39;)],</span>
<span class="sd"> )</span>
<span class="sd"> &gt;&gt;&gt; midx.is_monotonic_increasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; midx = ps.MultiIndex.from_tuples(</span>
<span class="sd"> ... [(&#39;z&#39;, &#39;a&#39;), (&#39;z&#39;, &#39;b&#39;), (&#39;y&#39;, &#39;c&#39;), (&#39;y&#39;, &#39;d&#39;), (&#39;x&#39;, &#39;e&#39;)])</span>
<span class="sd"> &gt;&gt;&gt; midx # doctest: +SKIP</span>
<span class="sd"> MultiIndex([(&#39;z&#39;, &#39;a&#39;),</span>
<span class="sd"> (&#39;z&#39;, &#39;b&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;c&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;d&#39;),</span>
<span class="sd"> (&#39;x&#39;, &#39;e&#39;)],</span>
<span class="sd"> )</span>
<span class="sd"> &gt;&gt;&gt; midx.is_monotonic_increasing</span>
<span class="sd"> False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_is_monotonic</span><span class="p">(</span><span class="s2">&quot;increasing&quot;</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">is_monotonic_decreasing</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return True if values in the object are monotonically decreasing, False otherwise.</span>
<span class="sd"> .. note:: the current implementation of is_monotonic_decreasing requires shuffling</span>
<span class="sd"> and aggregating multiple times to check the order locally and globally,</span>
<span class="sd"> which is potentially expensive. In case of multi-index, all data is transferred</span>
<span class="sd"> to a single node which can easily cause out-of-memory errors.</span>
<span class="sd"> .. note:: Disable the Spark config `spark.sql.optimizer.nestedSchemaPruning.enabled`</span>
<span class="sd"> for multi-index if you&#39;re using pandas-on-Spark &lt; 1.7.0 with PySpark 3.1.1.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> is_monotonic : bool</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([&#39;4/1/2018&#39;, &#39;3/1/2018&#39;, &#39;1/1/2018&#39;])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic_decreasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;dates&#39;: [None, &#39;3/1/2018&#39;, &#39;2/1/2018&#39;, &#39;1/1/2018&#39;]})</span>
<span class="sd"> &gt;&gt;&gt; df.dates.is_monotonic_decreasing</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; df.index.is_monotonic_decreasing</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([1])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic_decreasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic_decreasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser.rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.is_monotonic_decreasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([5, 4, 3, 2, 1], index=[1, 2, 3, 4, 5])</span>
<span class="sd"> &gt;&gt;&gt; ser.is_monotonic_decreasing</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ser.index.is_monotonic_decreasing</span>
<span class="sd"> False</span>
<span class="sd"> Support for MultiIndex</span>
<span class="sd"> &gt;&gt;&gt; midx = ps.MultiIndex.from_tuples(</span>
<span class="sd"> ... [(&#39;x&#39;, &#39;a&#39;), (&#39;x&#39;, &#39;b&#39;), (&#39;y&#39;, &#39;c&#39;), (&#39;y&#39;, &#39;d&#39;), (&#39;z&#39;, &#39;e&#39;)])</span>
<span class="sd"> &gt;&gt;&gt; midx # doctest: +SKIP</span>
<span class="sd"> MultiIndex([(&#39;x&#39;, &#39;a&#39;),</span>
<span class="sd"> (&#39;x&#39;, &#39;b&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;c&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;d&#39;),</span>
<span class="sd"> (&#39;z&#39;, &#39;e&#39;)],</span>
<span class="sd"> )</span>
<span class="sd"> &gt;&gt;&gt; midx.is_monotonic_decreasing</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; midx = ps.MultiIndex.from_tuples(</span>
<span class="sd"> ... [(&#39;z&#39;, &#39;e&#39;), (&#39;z&#39;, &#39;d&#39;), (&#39;y&#39;, &#39;c&#39;), (&#39;y&#39;, &#39;b&#39;), (&#39;x&#39;, &#39;a&#39;)])</span>
<span class="sd"> &gt;&gt;&gt; midx # doctest: +SKIP</span>
<span class="sd"> MultiIndex([(&#39;z&#39;, &#39;e&#39;),</span>
<span class="sd"> (&#39;z&#39;, &#39;d&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;c&#39;),</span>
<span class="sd"> (&#39;y&#39;, &#39;b&#39;),</span>
<span class="sd"> (&#39;x&#39;, &#39;a&#39;)],</span>
<span class="sd"> )</span>
<span class="sd"> &gt;&gt;&gt; midx.is_monotonic_decreasing</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_is_monotonic</span><span class="p">(</span><span class="s2">&quot;decreasing&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_is_locally_monotonic_spark_column</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">order</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">window</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_id&quot;</span><span class="p">))</span>
<span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span>
<span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">order</span> <span class="o">==</span> <span class="s2">&quot;increasing&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span> <span class="o">&amp;</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span>
<span class="s2">&quot;__origin&quot;</span>
<span class="p">)</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span> <span class="o">&amp;</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span>
<span class="s2">&quot;__origin&quot;</span>
<span class="p">)</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">_is_monotonic</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">order</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="k">assert</span> <span class="n">order</span> <span class="ow">in</span> <span class="p">(</span><span class="s2">&quot;increasing&quot;</span><span class="p">,</span> <span class="s2">&quot;decreasing&quot;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">spark_partition_id</span><span class="p">()</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span>
<span class="s2">&quot;__partition_id&quot;</span>
<span class="p">),</span> <span class="c1"># Make sure we use the same partition id in the whole job.</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">),</span>
<span class="p">)</span>
<span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_id&quot;</span><span class="p">),</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_is_locally_monotonic_spark_column</span><span class="p">(</span><span class="n">order</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span>
<span class="s2">&quot;__comparison_within_partition&quot;</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_id&quot;</span><span class="p">))</span>
<span class="o">.</span><span class="n">agg</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;__partition_min&quot;</span><span class="p">),</span>
<span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__origin&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;__partition_max&quot;</span><span class="p">),</span>
<span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__comparison_within_partition&quot;</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span>
<span class="s2">&quot;__comparison_within_partition&quot;</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="c1"># Now we&#39;re windowing the aggregation results without partition specification.</span>
<span class="c1"># The number of rows here will be the same as partitions, which is expected</span>
<span class="c1"># to be small.</span>
<span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_id&quot;</span><span class="p">))</span><span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="n">order</span> <span class="o">==</span> <span class="s2">&quot;increasing&quot;</span><span class="p">:</span>
<span class="n">comparison_col</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_min&quot;</span><span class="p">)</span> <span class="o">&gt;=</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_max&quot;</span><span class="p">),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span>
<span class="n">window</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">comparison_col</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_min&quot;</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__partition_max&quot;</span><span class="p">),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span>
<span class="n">window</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">comparison_col</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="s2">&quot;__comparison_between_partitions&quot;</span><span class="p">),</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__comparison_within_partition&quot;</span><span class="p">),</span>
<span class="p">)</span>
<span class="n">ret</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__comparison_between_partitions&quot;</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)))</span>
<span class="o">&amp;</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;__comparison_within_partition&quot;</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)))</span>
<span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
<span class="k">if</span> <span class="n">ret</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">True</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">ret</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">ndim</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return an int representing the number of array dimensions.</span>
<span class="sd"> Return 1 for Series / Index / MultiIndex.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> For Series</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([None, 1, 2, 3, 4], index=[4, 5, 2, 1, 8])</span>
<span class="sd"> &gt;&gt;&gt; s.ndim</span>
<span class="sd"> 1</span>
<span class="sd"> For Index</span>
<span class="sd"> &gt;&gt;&gt; s.index.ndim</span>
<span class="sd"> 1</span>
<span class="sd"> For MultiIndex</span>
<span class="sd"> &gt;&gt;&gt; midx = pd.MultiIndex([[&#39;lama&#39;, &#39;cow&#39;, &#39;falcon&#39;],</span>
<span class="sd"> ... [&#39;speed&#39;, &#39;weight&#39;, &#39;length&#39;]],</span>
<span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span>
<span class="sd"> ... [1, 1, 1, 1, 1, 2, 1, 2, 2]])</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)</span>
<span class="sd"> &gt;&gt;&gt; s.index.ndim</span>
<span class="sd"> 1</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="mi">1</span>
<span class="k">def</span> <span class="nf">astype</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">,</span> <span class="n">dtype</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">type</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Cast a pandas-on-Spark object to a specified dtype ``dtype``.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> dtype : data type</span>
<span class="sd"> Use a numpy.dtype or Python type to cast entire pandas object to</span>
<span class="sd"> the same type.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> casted : same type as caller</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> to_datetime : Convert argument to datetime.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([1, 2], dtype=&#39;int32&#39;)</span>
<span class="sd"> &gt;&gt;&gt; ser</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 2</span>
<span class="sd"> dtype: int32</span>
<span class="sd"> &gt;&gt;&gt; ser.astype(&#39;int64&#39;)</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 2</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; ser.rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.astype(&#39;int64&#39;)</span>
<span class="sd"> Index([1, 2], dtype=&#39;int64&#39;, name=&#39;a&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dtype</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">isin</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">,</span> <span class="n">values</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Any</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Check whether `values` are contained in Series or Index.</span>
<span class="sd"> Return a boolean Series or Index showing whether each element in the Series</span>
<span class="sd"> matches an element in the passed sequence of `values` exactly.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> values : set or list-like</span>
<span class="sd"> The sequence of values to test.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> isin : Series (bool dtype) or Index (bool dtype)</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([&#39;lama&#39;, &#39;cow&#39;, &#39;lama&#39;, &#39;beetle&#39;, &#39;lama&#39;,</span>
<span class="sd"> ... &#39;hippo&#39;], name=&#39;animal&#39;)</span>
<span class="sd"> &gt;&gt;&gt; s.isin([&#39;cow&#39;, &#39;lama&#39;])</span>
<span class="sd"> 0 True</span>
<span class="sd"> 1 True</span>
<span class="sd"> 2 True</span>
<span class="sd"> 3 False</span>
<span class="sd"> 4 True</span>
<span class="sd"> 5 False</span>
<span class="sd"> Name: animal, dtype: bool</span>
<span class="sd"> Passing a single string as ``s.isin(&#39;lama&#39;)`` will raise an error. Use</span>
<span class="sd"> a list of one element instead:</span>
<span class="sd"> &gt;&gt;&gt; s.isin([&#39;lama&#39;])</span>
<span class="sd"> 0 True</span>
<span class="sd"> 1 False</span>
<span class="sd"> 2 True</span>
<span class="sd"> 3 False</span>
<span class="sd"> 4 True</span>
<span class="sd"> 5 False</span>
<span class="sd"> Name: animal, dtype: bool</span>
<span class="sd"> &gt;&gt;&gt; s.rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.isin([&#39;lama&#39;]) # doctest: +SKIP</span>
<span class="sd"> Index([True, False, True, False, True, False], dtype=&#39;bool&#39;, name=&#39;a&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">values</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;only list-like objects are allowed to be passed&quot;</span>
<span class="s2">&quot; to isin(), you passed a [</span><span class="si">{values_type}</span><span class="s2">]&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">values_type</span><span class="o">=</span><span class="nb">type</span><span class="p">(</span><span class="n">values</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">values</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">cast</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">,</span> <span class="n">values</span><span class="p">)</span><span class="o">.</span><span class="n">tolist</span><span class="p">()</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">values</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">)</span> <span class="k">else</span> <span class="nb">list</span><span class="p">(</span><span class="n">values</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">other</span> <span class="o">=</span> <span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">values</span><span class="p">]</span>
<span class="n">scol</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">isin</span><span class="p">(</span><span class="n">other</span><span class="p">)</span>
<span class="n">field</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">np</span><span class="o">.</span><span class="n">dtype</span><span class="p">(</span><span class="s2">&quot;bool&quot;</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">=</span><span class="n">BooleanType</span><span class="p">(),</span> <span class="n">nullable</span><span class="o">=</span><span class="kc">False</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">scol</span><span class="o">=</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">scol</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">)),</span> <span class="n">field</span><span class="o">=</span><span class="n">field</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">isnull</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Detect missing values.</span>
<span class="sd"> Return a boolean same-sized object indicating if the values are NA.</span>
<span class="sd"> NA values, such as None or numpy.NaN, get mapped to True values.</span>
<span class="sd"> Everything else gets mapped to False values. Characters such as empty strings &#39;&#39; or</span>
<span class="sd"> numpy.inf are not considered NA values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or Index : Mask of bool values for each element in Series</span>
<span class="sd"> that indicates whether an element is an NA value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([5, 6, np.nan])</span>
<span class="sd"> &gt;&gt;&gt; ser.isna() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> 0 False</span>
<span class="sd"> 1 False</span>
<span class="sd"> 2 True</span>
<span class="sd"> dtype: bool</span>
<span class="sd"> &gt;&gt;&gt; ser.rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.isna() # doctest: +SKIP</span>
<span class="sd"> Index([False, False, True], dtype=&#39;bool&#39;, name=&#39;a&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes</span> <span class="kn">import</span> <span class="n">MultiIndex</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">MultiIndex</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;isna is not defined for MultiIndex&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">isnull</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="n">isna</span> <span class="o">=</span> <span class="n">isnull</span>
<span class="k">def</span> <span class="nf">notnull</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Detect existing (non-missing) values.</span>
<span class="sd"> Return a boolean same-sized object indicating if the values are not NA.</span>
<span class="sd"> Non-missing values get mapped to True.</span>
<span class="sd"> Characters such as empty strings &#39;&#39; or numpy.inf are not considered NA values.</span>
<span class="sd"> NA values, such as None or numpy.NaN, get mapped to False values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or Index : Mask of bool values for each element in Series</span>
<span class="sd"> that indicates whether an element is not an NA value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Show which entries in a Series are not NA.</span>
<span class="sd"> &gt;&gt;&gt; ser = ps.Series([5, 6, np.nan])</span>
<span class="sd"> &gt;&gt;&gt; ser</span>
<span class="sd"> 0 5.0</span>
<span class="sd"> 1 6.0</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; ser.notna()</span>
<span class="sd"> 0 True</span>
<span class="sd"> 1 True</span>
<span class="sd"> 2 False</span>
<span class="sd"> dtype: bool</span>
<span class="sd"> &gt;&gt;&gt; ser.rename(&quot;a&quot;).to_frame().set_index(&quot;a&quot;).index.notna() # doctest: +SKIP</span>
<span class="sd"> Index([True, True, False], dtype=&#39;bool&#39;, name=&#39;a&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes</span> <span class="kn">import</span> <span class="n">MultiIndex</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">MultiIndex</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;notna is not defined for MultiIndex&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="p">(</span><span class="o">~</span><span class="bp">self</span><span class="o">.</span><span class="n">isnull</span><span class="p">())</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="c1"># type: ignore[attr-defined]</span>
<span class="n">notna</span> <span class="o">=</span> <span class="n">notnull</span>
<span class="c1"># TODO: axis and many arguments should be implemented.</span>
<span class="k">def</span> <span class="nf">all</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return whether all elements are True.</span>
<span class="sd"> Returns True unless there is at least one element within a series that is</span>
<span class="sd"> False or equivalent (e.g. zero or empty).</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {0 or &#39;index&#39;}, default 0</span>
<span class="sd"> Indicate which axis or axes should be reduced.</span>
<span class="sd"> * 0 / &#39;index&#39; : reduce the index, return a Series whose index is the</span>
<span class="sd"> original column labels.</span>
<span class="sd"> skipna : boolean, default True</span>
<span class="sd"> Exclude NA values, such as None or numpy.NaN.</span>
<span class="sd"> If an entire row/column is NA values and `skipna` is True,</span>
<span class="sd"> then the result will be True, as for an empty row/column.</span>
<span class="sd"> If `skipna` is False, numpy.NaNs are treated as True because these are</span>
<span class="sd"> not equal to zero, Nones are treated as False.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([True, True]).all()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([True, False]).all()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([0, 1]).all()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3]).all()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([True, True, None]).all()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([True, True, None]).all(skipna=False)</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([True, False, None]).all()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([]).all()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([np.nan]).all()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([np.nan]).all(skipna=False)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([None]).all()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([None]).all(skipna=False)</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; df = ps.Series([True, False, None]).rename(&quot;a&quot;).to_frame()</span>
<span class="sd"> &gt;&gt;&gt; df.set_index(&quot;a&quot;).index.all()</span>
<span class="sd"> False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">&#39;axis should be either 0 or &quot;index&quot; currently.&#39;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="c1"># `any` and `every` were added as of Spark 3.0.</span>
<span class="c1"># ret = sdf.select(F.expr(&quot;every(CAST(`%s` AS BOOLEAN))&quot; % sdf.columns[0])).collect()[0][0]</span>
<span class="c1"># We use min as its alternative as below.</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)</span> <span class="ow">or</span> <span class="n">skipna</span><span class="p">:</span>
<span class="c1"># np.nan has no effect on the result; None has no effect if `skipna`</span>
<span class="n">ret</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;boolean&quot;</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">))))</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># Take None as False when not `skipna`</span>
<span class="n">ret</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;boolean&quot;</span><span class="p">)))</span>
<span class="p">)</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
<span class="k">if</span> <span class="n">ret</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">True</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">ret</span>
<span class="c1"># TODO: axis, skipna, and many arguments should be implemented.</span>
<span class="k">def</span> <span class="nf">any</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return whether any element is True.</span>
<span class="sd"> Returns False unless there is at least one element within a series that is</span>
<span class="sd"> True or equivalent (e.g. non-zero or non-empty).</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {0 or &#39;index&#39;}, default 0</span>
<span class="sd"> Indicate which axis or axes should be reduced.</span>
<span class="sd"> * 0 / &#39;index&#39; : reduce the index, return a Series whose index is the</span>
<span class="sd"> original column labels.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([False, False]).any()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([True, False]).any()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([0, 0]).any()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([0, 1, 2]).any()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([False, False, None]).any()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([True, False, None]).any()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([]).any()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([np.nan]).any()</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; df = ps.Series([True, False, None]).rename(&quot;a&quot;).to_frame()</span>
<span class="sd"> &gt;&gt;&gt; df.set_index(&quot;a&quot;).index.any()</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">&#39;axis should be either 0 or &quot;index&quot; currently.&#39;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">sdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="c1"># Note that we&#39;re ignoring `None`s here for now.</span>
<span class="c1"># any and every were added as of Spark 3.0</span>
<span class="c1"># ret = sdf.select(F.expr(&quot;any(CAST(`%s` AS BOOLEAN))&quot; % sdf.columns[0])).collect()[0][0]</span>
<span class="c1"># Here we use max as its alternative:</span>
<span class="n">ret</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;boolean&quot;</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">))))</span><span class="o">.</span><span class="n">collect</span><span class="p">()[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">]</span>
<span class="k">if</span> <span class="n">ret</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">False</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">ret</span>
<span class="c1"># TODO: add freq and axis parameters</span>
<span class="k">def</span> <span class="nf">shift</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Shift Series/Index by desired number of periods.</span>
<span class="sd"> .. note:: the current implementation of shift uses Spark&#39;s Window without</span>
<span class="sd"> specifying partition specification. This leads to moving all data into</span>
<span class="sd"> a single partition in a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method with very large datasets.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> periods : int</span>
<span class="sd"> Number of periods to shift. Can be positive or negative.</span>
<span class="sd"> fill_value : object, optional</span>
<span class="sd"> The scalar value to use for newly introduced missing values.</span>
<span class="sd"> The default depends on the dtype of self. For numeric data, np.nan is used.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Copy of input Series/Index, shifted.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;Col1&#39;: [10, 20, 15, 30, 45],</span>
<span class="sd"> ... &#39;Col2&#39;: [13, 23, 18, 33, 48],</span>
<span class="sd"> ... &#39;Col3&#39;: [17, 27, 22, 37, 52]},</span>
<span class="sd"> ... columns=[&#39;Col1&#39;, &#39;Col2&#39;, &#39;Col3&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.Col1.shift(periods=3)</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 NaN</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> 3 10.0</span>
<span class="sd"> 4 20.0</span>
<span class="sd"> Name: Col1, dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.Col2.shift(periods=3, fill_value=0)</span>
<span class="sd"> 0 0</span>
<span class="sd"> 1 0</span>
<span class="sd"> 2 0</span>
<span class="sd"> 3 13</span>
<span class="sd"> 4 23</span>
<span class="sd"> Name: Col2, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.index.shift(periods=3, fill_value=0)</span>
<span class="sd"> Index([0, 0, 0, 0, 1], dtype=&#39;int64&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_shift</span><span class="p">(</span><span class="n">periods</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">analyzed</span>
<span class="k">def</span> <span class="nf">_shift</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">IndexOpsLike</span><span class="p">,</span>
<span class="n">periods</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">fill_value</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">part_cols</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">(),</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">IndexOpsLike</span><span class="p">:</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">periods</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;periods should be an int; however, got [</span><span class="si">%s</span><span class="s2">]&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">periods</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="k">if</span> <span class="n">periods</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">col</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="n">window</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">part_cols</span><span class="p">)</span>
<span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span>
<span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span><span class="o">-</span><span class="n">periods</span><span class="p">,</span> <span class="o">-</span><span class="n">periods</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">lag_col</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">periods</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span>
<span class="n">col</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">lag_col</span><span class="o">.</span><span class="n">isNull</span><span class="p">()</span> <span class="o">|</span> <span class="n">F</span><span class="o">.</span><span class="n">isnan</span><span class="p">(</span><span class="n">lag_col</span><span class="p">),</span> <span class="n">fill_value</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">lag_col</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">field</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">))</span>
<span class="c1"># TODO: Update Documentation for Bins Parameter when it&#39;s supported</span>
<span class="k">def</span> <span class="nf">value_counts</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">normalize</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">sort</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">bins</span><span class="p">:</span> <span class="kc">None</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Series&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return a Series containing counts of unique values.</span>
<span class="sd"> The resulting object will be in descending order so that the</span>
<span class="sd"> first element is the most frequently-occurring element.</span>
<span class="sd"> Excludes NA values by default.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> normalize : boolean, default False</span>
<span class="sd"> If True then the object returned will contain the relative</span>
<span class="sd"> frequencies of the unique values.</span>
<span class="sd"> sort : boolean, default True</span>
<span class="sd"> Sort by values.</span>
<span class="sd"> ascending : boolean, default False</span>
<span class="sd"> Sort in ascending order.</span>
<span class="sd"> bins : Not Yet Supported</span>
<span class="sd"> dropna : boolean, default True</span>
<span class="sd"> Don&#39;t include counts of NaN.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> counts : Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.count: Number of non-NA elements in a Series.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> For Series</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;x&#39;:[0, 0, 1, 1, 1, np.nan]})</span>
<span class="sd"> &gt;&gt;&gt; df.x.value_counts() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> x</span>
<span class="sd"> 1.0 3</span>
<span class="sd"> 0.0 2</span>
<span class="sd"> Name: count, dtype: int64</span>
<span class="sd"> With `normalize` set to `True`, returns the relative frequency by</span>
<span class="sd"> dividing all values by the sum of values.</span>
<span class="sd"> &gt;&gt;&gt; df.x.value_counts(normalize=True) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> x</span>
<span class="sd"> 1.0 0.6</span>
<span class="sd"> 0.0 0.4</span>
<span class="sd"> Name: proportion, dtype: float64</span>
<span class="sd"> **dropna**</span>
<span class="sd"> With `dropna` set to `False` we can also see NaN index values.</span>
<span class="sd"> &gt;&gt;&gt; df.x.value_counts(dropna=False) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> x</span>
<span class="sd"> 1.0 3</span>
<span class="sd"> 0.0 2</span>
<span class="sd"> NaN 1</span>
<span class="sd"> Name: count, dtype: int64</span>
<span class="sd"> For Index</span>
<span class="sd"> &gt;&gt;&gt; idx = ps.Index([3, 1, 2, 3, 4, np.nan])</span>
<span class="sd"> &gt;&gt;&gt; idx</span>
<span class="sd"> Index([3.0, 1.0, 2.0, 3.0, 4.0, nan], dtype=&#39;float64&#39;)</span>
<span class="sd"> &gt;&gt;&gt; idx.value_counts().sort_index()</span>
<span class="sd"> 1.0 1</span>
<span class="sd"> 2.0 1</span>
<span class="sd"> 3.0 2</span>
<span class="sd"> 4.0 1</span>
<span class="sd"> Name: count, dtype: int64</span>
<span class="sd"> **sort**</span>
<span class="sd"> With `sort` set to `False`, the result won&#39;t be sorted by count.</span>
<span class="sd"> &gt;&gt;&gt; idx.value_counts(sort=False).sort_index()</span>
<span class="sd"> 1.0 1</span>
<span class="sd"> 2.0 1</span>
<span class="sd"> 3.0 2</span>
<span class="sd"> 4.0 1</span>
<span class="sd"> Name: count, dtype: int64</span>
<span class="sd"> **normalize**</span>
<span class="sd"> With `normalize` set to `True`, returns the relative frequency by</span>
<span class="sd"> dividing all values by the sum of values.</span>
<span class="sd"> &gt;&gt;&gt; idx.value_counts(normalize=True).sort_index()</span>
<span class="sd"> 1.0 0.2</span>
<span class="sd"> 2.0 0.2</span>
<span class="sd"> 3.0 0.4</span>
<span class="sd"> 4.0 0.2</span>
<span class="sd"> Name: proportion, dtype: float64</span>
<span class="sd"> **dropna**</span>
<span class="sd"> With `dropna` set to `False` we can also see NaN index values.</span>
<span class="sd"> &gt;&gt;&gt; idx.value_counts(dropna=False).sort_index() # doctest: +SKIP</span>
<span class="sd"> 1.0 1</span>
<span class="sd"> 2.0 1</span>
<span class="sd"> 3.0 2</span>
<span class="sd"> 4.0 1</span>
<span class="sd"> NaN 1</span>
<span class="sd"> Name: count, dtype: int64</span>
<span class="sd"> For MultiIndex.</span>
<span class="sd"> &gt;&gt;&gt; midx = pd.MultiIndex([[&#39;lama&#39;, &#39;cow&#39;, &#39;falcon&#39;],</span>
<span class="sd"> ... [&#39;speed&#39;, &#39;weight&#39;, &#39;length&#39;]],</span>
<span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span>
<span class="sd"> ... [1, 1, 1, 1, 1, 2, 1, 2, 2]])</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)</span>
<span class="sd"> &gt;&gt;&gt; s.index # doctest: +SKIP</span>
<span class="sd"> MultiIndex([( &#39;lama&#39;, &#39;weight&#39;),</span>
<span class="sd"> ( &#39;lama&#39;, &#39;weight&#39;),</span>
<span class="sd"> ( &#39;lama&#39;, &#39;weight&#39;),</span>
<span class="sd"> ( &#39;cow&#39;, &#39;weight&#39;),</span>
<span class="sd"> ( &#39;cow&#39;, &#39;weight&#39;),</span>
<span class="sd"> ( &#39;cow&#39;, &#39;length&#39;),</span>
<span class="sd"> (&#39;falcon&#39;, &#39;weight&#39;),</span>
<span class="sd"> (&#39;falcon&#39;, &#39;length&#39;),</span>
<span class="sd"> (&#39;falcon&#39;, &#39;length&#39;)],</span>
<span class="sd"> )</span>
<span class="sd"> &gt;&gt;&gt; s.index.value_counts().sort_index()</span>
<span class="sd"> (cow, length) 1</span>
<span class="sd"> (cow, weight) 2</span>
<span class="sd"> (falcon, length) 2</span>
<span class="sd"> (falcon, weight) 1</span>
<span class="sd"> (lama, weight) 3</span>
<span class="sd"> Name: count, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; s.index.value_counts(normalize=True).sort_index()</span>
<span class="sd"> (cow, length) 0.111111</span>
<span class="sd"> (cow, weight) 0.222222</span>
<span class="sd"> (falcon, length) 0.222222</span>
<span class="sd"> (falcon, weight) 0.111111</span>
<span class="sd"> (lama, weight) 0.333333</span>
<span class="sd"> Name: proportion, dtype: float64</span>
<span class="sd"> If the Index has a name, the name is kept.</span>
<span class="sd"> &gt;&gt;&gt; idx = ps.Index([0, 0, 0, 1, 1, 2, 3], name=&#39;pandas-on-Spark&#39;)</span>
<span class="sd"> &gt;&gt;&gt; idx.value_counts().sort_index()</span>
<span class="sd"> pandas-on-Spark</span>
<span class="sd"> 0 3</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 1</span>
<span class="sd"> 3 1</span>
<span class="sd"> Name: count, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes.multi</span> <span class="kn">import</span> <span class="n">MultiIndex</span>
<span class="k">if</span> <span class="n">bins</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;value_counts currently does not support bins&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">dropna</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">MultiIndex</span><span class="p">):</span>
<span class="c1"># If even one StructField is null, that row should be dropped.</span>
<span class="n">index_spark_column_names</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="n">cond</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">)</span>
<span class="k">for</span> <span class="n">index_spark_column_name</span> <span class="ow">in</span> <span class="n">index_spark_column_names</span><span class="p">:</span>
<span class="n">cond</span> <span class="o">=</span> <span class="n">cond</span> <span class="o">|</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">getItem</span><span class="p">(</span><span class="n">index_spark_column_name</span><span class="p">)</span><span class="o">.</span><span class="n">isNull</span><span class="p">()</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span>
<span class="n">sdf_dropna</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="o">~</span><span class="n">cond</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf_dropna</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span><span class="o">.</span><span class="n">dropna</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf_dropna</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span>
<span class="n">index_name</span> <span class="o">=</span> <span class="n">SPARK_DEFAULT_INDEX_NAME</span>
<span class="n">column_name</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf_dropna</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf_dropna</span><span class="p">,</span> <span class="n">column_name</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_name</span><span class="p">))</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="k">if</span> <span class="n">sort</span><span class="p">:</span>
<span class="k">if</span> <span class="n">ascending</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;count&quot;</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;count&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">desc</span><span class="p">())</span>
<span class="k">if</span> <span class="n">normalize</span><span class="p">:</span>
<span class="n">result_column_name</span> <span class="o">=</span> <span class="s2">&quot;proportion&quot;</span>
<span class="n">drop_sum</span> <span class="o">=</span> <span class="n">sdf_dropna</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="s2">&quot;count&quot;</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="s2">&quot;count&quot;</span><span class="p">)</span> <span class="o">/</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">drop_sum</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">result_column_name</span> <span class="o">=</span> <span class="s2">&quot;count&quot;</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_name</span><span class="p">)],</span>
<span class="n">index_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">,</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[(</span><span class="n">result_column_name</span><span class="p">,)],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;count&quot;</span><span class="p">)],</span>
<span class="n">column_label_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span>
def nunique(self, dropna: bool = True, approx: bool = False, rsd: float = 0.05) -> int:
    """
    Count the distinct elements of the object.

    NA values are left out of the count unless ``dropna=False``.

    Parameters
    ----------
    dropna : bool, default True
        Whether to leave NaN out of the count.
    approx : bool, default False
        When False, the exact algorithm is used and the exact number of distinct
        values is returned.
        When True, the HyperLogLog approximate algorithm is used instead, which is
        significantly faster for large amounts of data.
        Note: pandas has no such parameter; it exists only in pandas-on-Spark.
    rsd : float, default 0.05
        Maximum estimation error allowed in the HyperLogLog algorithm.
        Note: like ``approx``, this parameter exists only in pandas-on-Spark.

    Returns
    -------
    int

    See Also
    --------
    DataFrame.nunique: Method nunique for DataFrame.
    Series.count: Count non-NA/null observations in the Series.

    Examples
    --------
    >>> ps.Series([1, 2, 3, np.nan]).nunique()
    3

    >>> ps.Series([1, 2, 3, np.nan]).nunique(dropna=False)
    4

    On big data, we recommend using the approximate algorithm to speed up this function.
    The result will be very close to the exact unique count.

    >>> ps.Series([1, 2, 3, np.nan]).nunique(approx=True)
    3

    >>> idx = ps.Index([1, 1, 2, None])
    >>> idx
    Index([1.0, 1.0, 2.0, nan], dtype='float64')

    >>> idx.nunique()
    2

    >>> idx.nunique(dropna=False)
    3
    """
    # Build the aggregate expression first, then evaluate it over the whole frame.
    distinct_count_scol = self._nunique(dropna, approx, rsd)
    aggregated = self._internal.spark_frame.select([distinct_count_scol])
    # The aggregation yields a single row holding a single value.
    only_row = aggregated.collect()[0]
    return only_row[0]
def _nunique(self, dropna: bool = True, approx: bool = False, rsd: float = 0.05) -> Column:
    """
    Build the Spark aggregate column that counts the distinct values.

    ``F.approx_count_distinct`` (with ``rsd``) is used when ``approx`` is true,
    ``F.countDistinct`` otherwise. When ``dropna`` is false, one is added to the
    count if the column holds at least one null. The result is aliased to the
    name of the first data Spark column.
    """
    output_name = self._internal.data_spark_column_names[0]
    scol = self.spark.column

    if approx:
        distinct_counter = cast(
            Callable[[Column], Column], partial(F.approx_count_distinct, rsd=rsd)
        )
    else:
        distinct_counter = cast(Callable[[Column], Column], F.countDistinct)

    distinct_total = distinct_counter(scol)
    if dropna:
        return distinct_total.alias(output_name)

    # F.count skips nulls, so marking null rows with 1 (and every other row with
    # null) counts exactly the null rows.
    null_rows = F.count(F.when(scol.isNull(), 1).otherwise(None))
    null_bucket = F.when(null_rows >= 1, 1).otherwise(0)
    return (distinct_total + null_bucket).alias(output_name)
def take(self: IndexOpsLike, indices: Sequence[int]) -> IndexOpsLike:
    """
    Select elements by their *positional* indices along an axis.

    Positions refer to where an element actually sits in the object, not to the
    values held in the index attribute of the object.

    Parameters
    ----------
    indices : array-like
        An array of ints indicating which positions to take.

    Returns
    -------
    taken : same type as caller
        An array-like containing the elements taken from the object.

    Raises
    ------
    TypeError
        If `indices` is not list-like, or is a dict or a set.

    See Also
    --------
    DataFrame.loc : Select a subset of a DataFrame by labels.
    DataFrame.iloc : Select a subset of a DataFrame by positions.
    numpy.take : Take elements from an array along an axis.

    Examples
    --------

    Series

    >>> psser = ps.Series([100, 200, 300, 400, 500])
    >>> psser
    0    100
    1    200
    2    300
    3    400
    4    500
    dtype: int64

    >>> psser.take([0, 2, 4]).sort_index()
    0    100
    2    300
    4    500
    dtype: int64

    Index

    >>> psidx = ps.Index([100, 200, 300, 400, 500])
    >>> psidx
    Index([100, 200, 300, 400, 500], dtype='int64')

    >>> psidx.take([0, 2, 4]).sort_values()
    Index([100, 300, 500], dtype='int64')

    MultiIndex

    >>> psmidx = ps.MultiIndex.from_tuples([("x", "a"), ("x", "b"), ("x", "c")])
    >>> psmidx  # doctest: +SKIP
    MultiIndex([('x', 'a'),
                ('x', 'b'),
                ('x', 'c')],
               )

    >>> psmidx.take([0, 2])  # doctest: +SKIP
    MultiIndex([('x', 'a'),
                ('x', 'c')],
               )
    """
    # dict and set pass is_list_like, so they are rejected explicitly.
    rejected = isinstance(indices, (dict, set)) or not is_list_like(indices)
    if rejected:
        raise TypeError("`indices` must be a list-like except dict or set")

    if isinstance(self, ps.Series):
        taken = self.iloc[indices]
    else:
        # Anything that is not a Series is positioned through its backing
        # DataFrame, and the resulting index is handed back.
        taken = self._psdf.iloc[indices].index
    return cast(IndexOpsLike, taken)
def factorize(
    self: IndexOpsLike, sort: bool = True, use_na_sentinel: bool = True
) -> Tuple[IndexOpsLike, pd.Index]:
    """
    Encode the object as an enumerated type or categorical variable.

    This method is useful for obtaining a numeric representation of an
    array when all that matters is identifying distinct values.

    Parameters
    ----------
    sort : bool, default True
        Only ``True`` is supported: `uniques` is always sorted and the codes
        follow that order.
    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values and they are left out
        of `uniques`. If False, NaN values will be encoded as a non-negative integer
        and retained in `uniques`.

    Returns
    -------
    codes : Series or Index
        A Series or Index that's an indexer into `uniques`.
        ``uniques.take(codes)`` will have the same values as `values`.
    uniques : pd.Index
        The unique valid values.

        .. note ::

           With ``use_na_sentinel=True``, even if there's a missing value in
           `values`, `uniques` will *not* contain an entry for it.

    Raises
    ------
    AssertionError
        If `sort` is not ``True``.
    ValueError
        If the 'compute.max_rows' option is set and the object has more distinct
        values than that.

    Examples
    --------
    >>> psser = ps.Series(['b', None, 'a', 'c', 'b'])
    >>> codes, uniques = psser.factorize()
    >>> codes
    0    1
    1   -1
    2    0
    3    2
    4    1
    dtype: int32
    >>> uniques
    Index(['a', 'b', 'c'], dtype='object')

    >>> codes, uniques = psser.factorize(use_na_sentinel=False)
    >>> codes
    0    1
    1    3
    2    0
    3    2
    4    1
    dtype: int32
    >>> uniques
    Index(['a', 'b', 'c', None], dtype='object')

    For Index:

    >>> psidx = ps.Index(['b', None, 'a', 'c', 'b'])
    >>> codes, uniques = psidx.factorize()
    >>> codes
    Index([1, -1, 0, 2, 1], dtype='int32')
    >>> uniques
    Index(['a', 'b', 'c'], dtype='object')
    """
    from pyspark.pandas.series import first_series

    assert sort is True

    # NOTE(review): this warning fires on every call although the signature has no
    # `na_sentinel` argument; it looks like a leftover. Confirm before removing.
    warnings.warn(
        "Argument `na_sentinel` will be removed in 4.0.0.",
        FutureWarning,
    )

    if isinstance(self.dtype, CategoricalDtype):
        # Translate the stored category codes back into category values, then
        # factorize those values. With no categories the loop does not run and
        # the column stays all-null.
        scol = F.lit(None)
        for code, category in reversed(list(enumerate(self.dtype.categories))):
            scol = F.when(self.spark.column == F.lit(code), F.lit(category)).otherwise(scol)
        codes, uniques = self._with_new_scol(
            scol.alias(self._internal.data_spark_column_names[0])
        ).factorize(use_na_sentinel=use_na_sentinel)
        return codes, uniques.astype(self.dtype)

    uniq_sdf = self._internal.spark_frame.select(self.spark.column).distinct()

    # Check number of uniques and constructs sorted `uniques_list`
    max_compute_count = get_option("compute.max_rows")
    if max_compute_count is not None:
        # One row beyond the limit is enough to detect that it was exceeded.
        uniq_pdf = uniq_sdf.limit(max_compute_count + 1).toPandas()
        if len(uniq_pdf) > max_compute_count:
            raise ValueError(
                "Current Series has more than {0} unique values. "
                "Please set 'compute.max_rows' by using 'pyspark.pandas.config.set_option' "
                "to more than {0} rows. Note that, before changing the "
                "'compute.max_rows', this operation is considerably expensive.".format(
                    max_compute_count
                )
            )
    else:
        uniq_pdf = uniq_sdf.toPandas()
    # pandas takes both NaN and null in Spark to np.nan, so de-duplication is required
    uniq_series = first_series(uniq_pdf).drop_duplicates()
    # Non-missing values in ascending order, missing values last.
    uniques_list = sorted(uniq_series.tolist(), key=lambda x: (pd.isna(x), x))

    # Constructs `unique_to_code` mapping non-na unique to code
    unique_to_code = {}
    # With the sentinel, missing values are always coded -1. Without it, their code
    # is the position they take in the sorted uniques, found in the loop below.
    na_sentinel_code = -1 if use_na_sentinel else None
    code = 0
    for unique in uniques_list:
        if pd.isna(unique):
            if not use_na_sentinel:
                na_sentinel_code = code
        else:
            unique_to_code[unique] = code
        code += 1
    if na_sentinel_code is None:
        # No missing value among the uniques: fall back to the code one would have
        # received, so the name is always bound when the column is built below.
        na_sentinel_code = code

    kvs = list(
        chain.from_iterable(
            (F.lit(unique), F.lit(code)) for unique, code in unique_to_code.items()
        )
    )

    if len(kvs) == 0:  # uniques are all missing values
        new_scol = F.lit(na_sentinel_code)
    else:
        map_scol = F.create_map(*kvs)
        null_scol = F.when(self.isnull().spark.column, F.lit(na_sentinel_code))
        new_scol = null_scol.otherwise(map_scol[self.spark.column])

    codes = self._with_new_scol(new_scol.alias(self._internal.data_spark_column_names[0]))

    if use_na_sentinel:
        # Drops the NaN from the uniques of the values
        uniques_list = [x for x in uniques_list if not pd.isna(x)]

    uniques = pd.Index(uniques_list)

    return codes, uniques
def _test() -> None:
    """
    Run the doctests of ``pyspark.pandas.base`` against a local Spark session.

    The working directory is changed to ``SPARK_HOME`` first, and the process
    exits with status -1 if any doctest fails.
    """
    import os
    import doctest
    import sys
    from pyspark.sql import SparkSession
    import pyspark.pandas.base

    spark_home = os.environ["SPARK_HOME"]
    os.chdir(spark_home)

    # The doctests see the module namespace plus `ps` bound to pyspark.pandas.
    doctest_globals = pyspark.pandas.base.__dict__.copy()
    doctest_globals["ps"] = pyspark.pandas

    builder = SparkSession.builder.master("local[4]").appName("pyspark.pandas.base tests")
    session = builder.getOrCreate()

    flags = doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
    results = doctest.testmod(
        pyspark.pandas.base,
        globs=doctest_globals,
        optionflags=flags,
    )
    failure_count, test_count = results

    session.stop()
    if failure_count:
        sys.exit(-1)
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="n">_test</span><span class="p">()</span>
</pre></div>
</article>
<footer class="bd-footer-article">
<div class="footer-article-items footer-article__inner">
<div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
</div></div>
</div>
</footer>
</div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts are placed inside <body>, after the main content, so they do not block parsing of the content above -->
<script src="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script>
<script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item"><p class="copyright">
Copyright © 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
</p></div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item"><p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3.
</p></div>
</div>
</div>
</footer>
</body>
</html>