<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>pyspark.pandas.generic &#8212; PySpark 4.0.0-preview1 documentation</title>
<script data-cfasync="false">
document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../../../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../../../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../../../_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52" />
<link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" />
<script data-url_root="../../../" id="documentation_options" src="../../../_static/documentation_options.js"></script>
<script src="../../../_static/jquery.js"></script>
<script src="../../../_static/underscore.js"></script>
<script src="../../../_static/doctools.js"></script>
<script src="../../../_static/clipboard.min.js"></script>
<script src="../../../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script>DOCUMENTATION_OPTIONS.pagename = '_modules/pyspark/pandas/generic';</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/pandas/generic.html" />
<link rel="search" title="Search" href="../../../search.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="None">
<!-- Matomo -->
<script type="text/javascript">
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<a class="skip-link" href="#main-content">Skip to main content</a>
<input type="checkbox"
class="sidebar-toggle"
name="__primary"
id="__primary"/>
<label class="overlay overlay-primary" for="__primary"></label>
<input type="checkbox"
class="sidebar-toggle"
name="__secondary"
id="__secondary"/>
<label class="overlay overlay-secondary" for="__secondary"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../../../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<nav class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<label class="sidebar-toggle primary-toggle" for="__primary">
<span class="fa-solid fa-bars"></span>
</label>
<div class="navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../../../index.html">
<img src="../../../_static/spark-logo-light.png" class="logo__image only-light" alt="Logo image"/>
<script>document.write(`<img src="../../../_static/spark-logo-dark.png" class="logo__image only-dark" alt="Logo image"/>`);</script>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<script>
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Function to construct the target URL from the JSON components
function buildURL(entry) {
var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja
template = template.replace("{version}", entry.version);
return template;
}
// Function to check if corresponding page path exists in other version of docs
// and, if so, go there instead of the homepage of the other docs version
function checkPageExistsAndRedirect(event) {
const currentFilePath = "_modules/pyspark/pandas/generic.html",
otherDocsHomepage = event.target.getAttribute("href");
let tryUrl = `${otherDocsHomepage}${currentFilePath}`;
$.ajax({
type: 'HEAD',
url: tryUrl,
// if the page exists, go there
success: function() {
location.href = tryUrl;
}
}).fail(function() {
location.href = otherDocsHomepage;
});
return false;
}
// Function to populate the version switcher
(function () {
// get JSON config
$.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) {
// create the nodes first (before AJAX calls) to ensure the order is
// correct (for now, links will go to doc version homepage)
$.each(data, function(index, entry) {
// if no custom name specified (e.g., "latest"), use version string
if (!("name" in entry)) {
entry.name = entry.version;
}
// construct the appropriate URL, and add it to the dropdown
entry.url = buildURL(entry);
const node = document.createElement("a");
node.setAttribute("class", "list-group-item list-group-item-action py-1");
node.setAttribute("href", `${entry.url}`);
node.textContent = `${entry.name}`;
node.onclick = checkPageExistsAndRedirect;
$("#version_switcher").append(node);
});
});
})();
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<script>
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
</div>
</nav>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar hide-on-wide">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../index.html">
Overview
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../getting_started/index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../../../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Function to construct the target URL from the JSON components
function buildURL(entry) {
var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja
template = template.replace("{version}", entry.version);
return template;
}
// Function to check if corresponding page path exists in other version of docs
// and, if so, go there instead of the homepage of the other docs version
function checkPageExistsAndRedirect(event) {
const currentFilePath = "_modules/pyspark/pandas/generic.html",
otherDocsHomepage = event.target.getAttribute("href");
let tryUrl = `${otherDocsHomepage}${currentFilePath}`;
$.ajax({
type: 'HEAD',
url: tryUrl,
// if the page exists, go there
success: function() {
location.href = tryUrl;
}
}).fail(function() {
location.href = otherDocsHomepage;
});
return false;
}
// Function to populate the version switcher
(function () {
// get JSON config
$.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) {
// create the nodes first (before AJAX calls) to ensure the order is
// correct (for now, links will go to doc version homepage)
$.each(data, function(index, entry) {
// if no custom name specified (e.g., "latest"), use version string
if (!("name" in entry)) {
entry.name = entry.version;
}
// construct the appropriate URL, and add it to the dropdown
entry.url = buildURL(entry);
const node = document.createElement("a");
node.setAttribute("class", "list-group-item list-group-item-action py-1");
node.setAttribute("href", `${entry.url}`);
node.textContent = `${entry.name}`;
node.onclick = checkPageExistsAndRedirect;
$("#version_switcher").append(node);
});
});
})();
</script></div>
<div class="navbar-item">
<script>
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumbs">
<ul class="bd-breadcrumbs" role="navigation" aria-label="Breadcrumb">
<li class="breadcrumb-item breadcrumb-home">
<a href="../../../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item"><a href="../../index.html" class="nav-link">Module code</a></li>
<li class="breadcrumb-item active" aria-current="page">pyspark.pandas.generic</li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article" role="main">
<h1>Source code for pyspark.pandas.generic</h1><div class="highlight"><pre>
<span></span><span class="c1">#</span>
<span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span>
<span class="c1"># contributor license agreements. See the NOTICE file distributed with</span>
<span class="c1"># this work for additional information regarding copyright ownership.</span>
<span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span>
<span class="c1"># (the &quot;License&quot;); you may not use this file except in compliance with</span>
<span class="c1"># the License. You may obtain a copy of the License at</span>
<span class="c1">#</span>
<span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span>
<span class="c1">#</span>
<span class="c1"># Unless required by applicable law or agreed to in writing, software</span>
<span class="c1"># distributed under the License is distributed on an &quot;AS IS&quot; BASIS,</span>
<span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span>
<span class="c1"># See the License for the specific language governing permissions and</span>
<span class="c1"># limitations under the License.</span>
<span class="c1">#</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd">A base class of DataFrame/Column to behave like pandas DataFrame/Series.</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">abc</span> <span class="kn">import</span> <span class="n">ABCMeta</span><span class="p">,</span> <span class="n">abstractmethod</span>
<span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">reduce</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">Any</span><span class="p">,</span>
<span class="n">Callable</span><span class="p">,</span>
<span class="n">Dict</span><span class="p">,</span>
<span class="n">IO</span><span class="p">,</span>
<span class="n">List</span><span class="p">,</span>
<span class="n">Optional</span><span class="p">,</span>
<span class="n">NoReturn</span><span class="p">,</span>
<span class="n">Tuple</span><span class="p">,</span>
<span class="n">Union</span><span class="p">,</span>
<span class="n">TYPE_CHECKING</span><span class="p">,</span>
<span class="n">cast</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">import</span> <span class="nn">warnings</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="kn">from</span> <span class="nn">pandas.api.types</span> <span class="kn">import</span> <span class="n">is_list_like</span> <span class="c1"># type: ignore[attr-defined]</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Column</span><span class="p">,</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">BooleanType</span><span class="p">,</span>
<span class="n">DoubleType</span><span class="p">,</span>
<span class="n">LongType</span><span class="p">,</span>
<span class="n">NumericType</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">pandas</span> <span class="k">as</span> <span class="n">ps</span> <span class="c1"># For running doctests and reference resolution in PyCharm.</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas._typing</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">Axis</span><span class="p">,</span>
<span class="n">DataFrameOrSeries</span><span class="p">,</span>
<span class="n">Dtype</span><span class="p">,</span>
<span class="n">FrameLike</span><span class="p">,</span>
<span class="n">Label</span><span class="p">,</span>
<span class="n">Name</span><span class="p">,</span>
<span class="n">Scalar</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexing</span> <span class="kn">import</span> <span class="n">AtIndexer</span><span class="p">,</span> <span class="n">iAtIndexer</span><span class="p">,</span> <span class="n">iLocIndexer</span><span class="p">,</span> <span class="n">LocIndexer</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.internal</span> <span class="kn">import</span> <span class="n">InternalFrame</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.spark</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">SF</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.typedef</span> <span class="kn">import</span> <span class="n">spark_type_to_pandas_dtype</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.utils</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">is_name_like_tuple</span><span class="p">,</span>
<span class="n">is_name_like_value</span><span class="p">,</span>
<span class="n">name_like_string</span><span class="p">,</span>
<span class="n">scol_for</span><span class="p">,</span>
<span class="n">sql_conf</span><span class="p">,</span>
<span class="n">validate_arguments_and_invoke_function</span><span class="p">,</span>
<span class="n">validate_axis</span><span class="p">,</span>
<span class="n">validate_mode</span><span class="p">,</span>
<span class="n">SPARK_CONF_ARROW_ENABLED</span><span class="p">,</span>
<span class="n">log_advice</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.frame</span> <span class="kn">import</span> <span class="n">DataFrame</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.indexes.base</span> <span class="kn">import</span> <span class="n">Index</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.groupby</span> <span class="kn">import</span> <span class="n">GroupBy</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">Rolling</span><span class="p">,</span> <span class="n">Expanding</span><span class="p">,</span> <span class="n">ExponentialMoving</span>
<span class="n">bool_type</span> <span class="o">=</span> <span class="nb">bool</span>
<span class="k">class</span> <span class="nc">Frame</span><span class="p">(</span><span class="nb">object</span><span class="p">,</span> <span class="n">metaclass</span><span class="o">=</span><span class="n">ABCMeta</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> The base class for both DataFrame and Series.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@property</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_internal</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">InternalFrame</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span>
<span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">&quot;Series&quot;</span><span class="p">],</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;Series&quot;</span><span class="p">,</span> <span class="n">Column</span><span class="p">]],</span>
<span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_reduce_for_stat_function</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">sfun</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">&quot;Series&quot;</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span>
<span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;Series&quot;</span><span class="p">,</span> <span class="n">Scalar</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="nd">@property</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">dtypes</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">,</span> <span class="n">Dtype</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">to_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_to_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="nd">@property</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">index</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Index&quot;</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">copy</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_to_internal_pandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">]:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="k">pass</span>
<span class="c1"># TODO: add &#39;axis&#39; parameter</span>
<span class="k">def</span> <span class="nf">cummin</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return cumulative minimum over a DataFrame or Series axis.</span>
<span class="sd"> Returns a DataFrame or Series of the same size containing the cumulative minimum.</span>
<span class="sd"> .. note:: the current implementation of cummin uses Spark&#39;s Window without</span>
<span class="sd"> specifying partition specification. This leads to moveing all data into a</span>
<span class="sd"> single partition in a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method with very large datasets.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> skipna: boolean, default True</span>
<span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.min: Return the minimum over DataFrame axis.</span>
<span class="sd"> DataFrame.cummax: Return cumulative maximum over DataFrame axis.</span>
<span class="sd"> DataFrame.cummin: Return cumulative minimum over DataFrame axis.</span>
<span class="sd"> DataFrame.cumsum: Return cumulative sum over DataFrame axis.</span>
<span class="sd"> Series.min: Return the minimum over Series axis.</span>
<span class="sd"> Series.cummax: Return cumulative maximum over Series axis.</span>
<span class="sd"> Series.cummin: Return cumulative minimum over Series axis.</span>
<span class="sd"> Series.cumsum: Return cumulative sum over Series axis.</span>
<span class="sd"> Series.cumprod: Return cumulative product over Series axis.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list(&#39;AB&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 3.0 NaN</span>
<span class="sd"> 2 1.0 0.0</span>
<span class="sd"> By default, iterates over rows and finds the minimum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.cummin()</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 2.0 NaN</span>
<span class="sd"> 2 1.0 0.0</span>
<span class="sd"> It works identically in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.A.cummin()</span>
<span class="sd"> 0 2.0</span>
<span class="sd"> 1 2.0</span>
<span class="sd"> 2 1.0</span>
<span class="sd"> Name: A, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">,</span> <span class="n">skipna</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="c1"># TODO: add &#39;axis&#39; parameter</span>
<span class="k">def</span> <span class="nf">cummax</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return cumulative maximum over a DataFrame or Series axis.</span>
<span class="sd"> Returns a DataFrame or Series of the same size containing the cumulative maximum.</span>
<span class="sd"> .. note:: the current implementation of cummax uses Spark&#39;s Window without</span>
<span class="sd"> specifying partition specification. This leads to moveing all data into a</span>
<span class="sd"> single partition in a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method with very large datasets.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> skipna: boolean, default True</span>
<span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.max: Return the maximum over DataFrame axis.</span>
<span class="sd"> DataFrame.cummax: Return cumulative maximum over DataFrame axis.</span>
<span class="sd"> DataFrame.cummin: Return cumulative minimum over DataFrame axis.</span>
<span class="sd"> DataFrame.cumsum: Return cumulative sum over DataFrame axis.</span>
<span class="sd"> DataFrame.cumprod: Return cumulative product over DataFrame axis.</span>
<span class="sd"> Series.max: Return the maximum over Series axis.</span>
<span class="sd"> Series.cummax: Return cumulative maximum over Series axis.</span>
<span class="sd"> Series.cummin: Return cumulative minimum over Series axis.</span>
<span class="sd"> Series.cumsum: Return cumulative sum over Series axis.</span>
<span class="sd"> Series.cumprod: Return cumulative product over Series axis.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list(&#39;AB&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 3.0 NaN</span>
<span class="sd"> 2 1.0 0.0</span>
<span class="sd"> By default, iterates over rows and finds the maximum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.cummax()</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 3.0 NaN</span>
<span class="sd"> 2 3.0 1.0</span>
<span class="sd"> It works identically in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.B.cummax()</span>
<span class="sd"> 0 1.0</span>
<span class="sd"> 1 NaN</span>
<span class="sd"> 2 1.0</span>
<span class="sd"> Name: B, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">,</span> <span class="n">skipna</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="c1"># TODO: add &#39;axis&#39; parameter</span>
<span class="k">def</span> <span class="nf">cumsum</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return cumulative sum over a DataFrame or Series axis.</span>
<span class="sd"> Returns a DataFrame or Series of the same size containing the cumulative sum.</span>
<span class="sd"> .. note:: the current implementation of cumsum uses Spark&#39;s Window without</span>
<span class="sd"> specifying partition specification. This leads to moveing all data into a</span>
<span class="sd"> single partition in a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method with very large datasets.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> skipna: boolean, default True</span>
<span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.sum: Return the sum over DataFrame axis.</span>
<span class="sd"> DataFrame.cummax: Return cumulative maximum over DataFrame axis.</span>
<span class="sd"> DataFrame.cummin: Return cumulative minimum over DataFrame axis.</span>
<span class="sd"> DataFrame.cumsum: Return cumulative sum over DataFrame axis.</span>
<span class="sd"> DataFrame.cumprod: Return cumulative product over DataFrame axis.</span>
<span class="sd"> Series.sum: Return the sum over Series axis.</span>
<span class="sd"> Series.cummax: Return cumulative maximum over Series axis.</span>
<span class="sd"> Series.cummin: Return cumulative minimum over Series axis.</span>
<span class="sd"> Series.cumsum: Return cumulative sum over Series axis.</span>
<span class="sd"> Series.cumprod: Return cumulative product over Series axis.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[2.0, 1.0], [3.0, None], [1.0, 0.0]], columns=list(&#39;AB&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 3.0 NaN</span>
<span class="sd"> 2 1.0 0.0</span>
<span class="sd"> By default, iterates over rows and finds the sum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.cumsum()</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 5.0 NaN</span>
<span class="sd"> 2 6.0 1.0</span>
<span class="sd"> It works identically in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.A.cumsum()</span>
<span class="sd"> 0 2.0</span>
<span class="sd"> 1 5.0</span>
<span class="sd"> 2 6.0</span>
<span class="sd"> Name: A, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_cumsum</span><span class="p">(</span><span class="n">skipna</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="c1"># TODO: add &#39;axis&#39; parameter</span>
<span class="k">def</span> <span class="nf">cumprod</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return cumulative product over a DataFrame or Series axis.</span>
<span class="sd"> Returns a DataFrame or Series of the same size containing the cumulative product.</span>
<span class="sd"> .. note:: the current implementation of cumprod uses Spark&#39;s Window without</span>
<span class="sd"> specifying partition specification. This leads to moveing all data into a</span>
<span class="sd"> single partition in a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method with very large datasets.</span>
<span class="sd"> .. note:: unlike pandas&#39;, pandas-on-Spark&#39;s emulates cumulative product by</span>
<span class="sd"> ``exp(sum(log(...)))`` trick. Therefore, it only works for positive numbers.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> skipna: boolean, default True</span>
<span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.cummax: Return cumulative maximum over DataFrame axis.</span>
<span class="sd"> DataFrame.cummin: Return cumulative minimum over DataFrame axis.</span>
<span class="sd"> DataFrame.cumsum: Return cumulative sum over DataFrame axis.</span>
<span class="sd"> DataFrame.cumprod: Return cumulative product over DataFrame axis.</span>
<span class="sd"> Series.cummax: Return cumulative maximum over Series axis.</span>
<span class="sd"> Series.cummin: Return cumulative minimum over Series axis.</span>
<span class="sd"> Series.cumsum: Return cumulative sum over Series axis.</span>
<span class="sd"> Series.cumprod: Return cumulative product over Series axis.</span>
<span class="sd"> Raises</span>
<span class="sd"> ------</span>
<span class="sd"> Exception: If the values is equal to or lower than 0.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[2.0, 1.0], [3.0, None], [4.0, 10.0]], columns=list(&#39;AB&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 3.0 NaN</span>
<span class="sd"> 2 4.0 10.0</span>
<span class="sd"> By default, iterates over rows and finds the sum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.cumprod()</span>
<span class="sd"> A B</span>
<span class="sd"> 0 2.0 1.0</span>
<span class="sd"> 1 6.0 NaN</span>
<span class="sd"> 2 24.0 10.0</span>
<span class="sd"> It works identically in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.A.cumprod()</span>
<span class="sd"> 0 2.0</span>
<span class="sd"> 1 6.0</span>
<span class="sd"> 2 24.0</span>
<span class="sd"> Name: A, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">psser</span><span class="o">.</span><span class="n">_cumprod</span><span class="p">(</span><span class="n">skipna</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">pipe</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Apply func(self, \*args, \*\*kwargs).</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> func: function</span>
<span class="sd"> function to apply to the DataFrame.</span>
<span class="sd"> ``args``, and ``kwargs`` are passed into ``func``.</span>
<span class="sd"> Alternatively a ``(callable, data_keyword)`` tuple where</span>
<span class="sd"> ``data_keyword`` is a string indicating the keyword of</span>
<span class="sd"> ``callable`` that expects the DataFrames.</span>
<span class="sd"> args: iterable, optional</span>
<span class="sd"> positional arguments passed into ``func``.</span>
<span class="sd"> kwargs: mapping, optional</span>
<span class="sd"> a dictionary of keyword arguments passed into ``func``.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> object: the return type of ``func``.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Use ``.pipe`` when chaining together functions that expect</span>
<span class="sd"> Series, DataFrames or GroupBy objects. For example, given</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;category&#39;: [&#39;A&#39;, &#39;A&#39;, &#39;B&#39;],</span>
<span class="sd"> ... &#39;col1&#39;: [1, 2, 3],</span>
<span class="sd"> ... &#39;col2&#39;: [4, 5, 6]},</span>
<span class="sd"> ... columns=[&#39;category&#39;, &#39;col1&#39;, &#39;col2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; def keep_category_a(df):</span>
<span class="sd"> ... return df[df[&#39;category&#39;] == &#39;A&#39;]</span>
<span class="sd"> &gt;&gt;&gt; def add_one(df, column):</span>
<span class="sd"> ... return df.assign(col3=df[column] + 1)</span>
<span class="sd"> &gt;&gt;&gt; def multiply(df, column1, column2):</span>
<span class="sd"> ... return df.assign(col4=df[column1] * df[column2])</span>
<span class="sd"> instead of writing</span>
<span class="sd"> &gt;&gt;&gt; multiply(add_one(keep_category_a(df), column=&quot;col1&quot;), column1=&quot;col2&quot;, column2=&quot;col3&quot;)</span>
<span class="sd"> category col1 col2 col3 col4</span>
<span class="sd"> 0 A 1 4 2 8</span>
<span class="sd"> 1 A 2 5 3 15</span>
<span class="sd"> You can write</span>
<span class="sd"> &gt;&gt;&gt; (df.pipe(keep_category_a)</span>
<span class="sd"> ... .pipe(add_one, column=&quot;col1&quot;)</span>
<span class="sd"> ... .pipe(multiply, column1=&quot;col2&quot;, column2=&quot;col3&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> category col1 col2 col3 col4</span>
<span class="sd"> 0 A 1 4 2 8</span>
<span class="sd"> 1 A 2 5 3 15</span>
<span class="sd"> If you have a function that takes the data as the second</span>
<span class="sd"> argument, pass a tuple indicating which keyword expects the</span>
<span class="sd"> data. For example, suppose ``f`` takes its data as ``df``:</span>
<span class="sd"> &gt;&gt;&gt; def multiply_2(column1, df, column2):</span>
<span class="sd"> ... return df.assign(col4=df[column1] * df[column2])</span>
<span class="sd"> Then you can write</span>
<span class="sd"> &gt;&gt;&gt; (df.pipe(keep_category_a)</span>
<span class="sd"> ... .pipe(add_one, column=&quot;col1&quot;)</span>
<span class="sd"> ... .pipe((multiply_2, &#39;df&#39;), column1=&quot;col2&quot;, column2=&quot;col3&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> category col1 col2 col3 col4</span>
<span class="sd"> 0 A 1 4 2 8</span>
<span class="sd"> 1 A 2 5 3 15</span>
<span class="sd"> You can use lambda as well</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3]).pipe(lambda x: (x + 1).rename(&quot;value&quot;))</span>
<span class="sd"> 0 2</span>
<span class="sd"> 1 3</span>
<span class="sd"> 2 4</span>
<span class="sd"> Name: value, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span>
<span class="n">func</span><span class="p">,</span> <span class="n">target</span> <span class="o">=</span> <span class="n">func</span>
<span class="k">if</span> <span class="n">target</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">%s</span><span class="s2"> is both the pipe target and a keyword &quot;</span> <span class="s2">&quot;argument&quot;</span> <span class="o">%</span> <span class="n">target</span><span class="p">)</span>
<span class="n">kwargs</span><span class="p">[</span><span class="n">target</span><span class="p">]</span> <span class="o">=</span> <span class="bp">self</span>
<span class="k">return</span> <span class="n">func</span><span class="p">(</span><span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">func</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">to_numpy</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A NumPy ndarray representing the values in this DataFrame or Series.</span>
<span class="sd"> .. note:: This method should only be used if the resulting NumPy ndarray is expected</span>
<span class="sd"> to be small, as all the data is loaded into the driver&#39;s memory.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> numpy.ndarray</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({&quot;A&quot;: [1, 2], &quot;B&quot;: [3, 4]}).to_numpy()</span>
<span class="sd"> array([[1, 3],</span>
<span class="sd"> [2, 4]])</span>
<span class="sd"> With heterogeneous data, the lowest common type will have to be used.</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({&quot;A&quot;: [1, 2], &quot;B&quot;: [3.0, 4.5]}).to_numpy()</span>
<span class="sd"> array([[1. , 3. ],</span>
<span class="sd"> [2. , 4.5]])</span>
<span class="sd"> For a mix of numeric and non-numeric types, the output array will have object dtype.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&quot;A&quot;: [1, 2], &quot;B&quot;: [3.0, 4.5], &quot;C&quot;: pd.date_range(&#39;2000&#39;, periods=2)})</span>
<span class="sd"> &gt;&gt;&gt; df.to_numpy()</span>
<span class="sd"> array([[1, 3.0, Timestamp(&#39;2000-01-01 00:00:00&#39;)],</span>
<span class="sd"> [2, 4.5, Timestamp(&#39;2000-01-02 00:00:00&#39;)]], dtype=object)</span>
<span class="sd"> For Series,</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([&#39;a&#39;, &#39;b&#39;, &#39;a&#39;]).to_numpy()</span>
<span class="sd"> array([&#39;a&#39;, &#39;b&#39;, &#39;a&#39;], dtype=object)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;`to_numpy` loads all data into the driver&#39;s memory. &quot;</span>
<span class="s2">&quot;It should only be used if the resulting NumPy ndarray is expected to be small.&quot;</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">values</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">values</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return a Numpy representation of the DataFrame or the Series.</span>
<span class="sd"> .. warning:: We recommend using `DataFrame.to_numpy()` or `Series.to_numpy()` instead.</span>
<span class="sd"> .. note:: This method should only be used if the resulting NumPy ndarray is expected</span>
<span class="sd"> to be small, as all the data is loaded into the driver&#39;s memory.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> numpy.ndarray</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> A DataFrame where all columns are the same type (e.g., int64) results in an array of</span>
<span class="sd"> the same type.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;age&#39;: [ 3, 29],</span>
<span class="sd"> ... &#39;height&#39;: [94, 170],</span>
<span class="sd"> ... &#39;weight&#39;: [31, 115]})</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> age height weight</span>
<span class="sd"> 0 3 94 31</span>
<span class="sd"> 1 29 170 115</span>
<span class="sd"> &gt;&gt;&gt; df.dtypes</span>
<span class="sd"> age int64</span>
<span class="sd"> height int64</span>
<span class="sd"> weight int64</span>
<span class="sd"> dtype: object</span>
<span class="sd"> &gt;&gt;&gt; df.values</span>
<span class="sd"> array([[ 3, 94, 31],</span>
<span class="sd"> [ 29, 170, 115]])</span>
<span class="sd"> A DataFrame with mixed type columns(e.g., str/object, int64, float32) results in an ndarray</span>
<span class="sd"> of the broadest type that accommodates these mixed types (e.g., object).</span>
<span class="sd"> &gt;&gt;&gt; df2 = ps.DataFrame([(&#39;parrot&#39;, 24.0, &#39;second&#39;),</span>
<span class="sd"> ... (&#39;lion&#39;, 80.5, &#39;first&#39;),</span>
<span class="sd"> ... (&#39;monkey&#39;, np.nan, None)],</span>
<span class="sd"> ... columns=(&#39;name&#39;, &#39;max_speed&#39;, &#39;rank&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df2.dtypes</span>
<span class="sd"> name object</span>
<span class="sd"> max_speed float64</span>
<span class="sd"> rank object</span>
<span class="sd"> dtype: object</span>
<span class="sd"> &gt;&gt;&gt; df2.values</span>
<span class="sd"> array([[&#39;parrot&#39;, 24.0, &#39;second&#39;],</span>
<span class="sd"> [&#39;lion&#39;, 80.5, &#39;first&#39;],</span>
<span class="sd"> [&#39;monkey&#39;, nan, None]], dtype=object)</span>
<span class="sd"> For Series,</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3]).values</span>
<span class="sd"> array([1, 2, 3])</span>
<span class="sd"> &gt;&gt;&gt; ps.Series(list(&#39;aabc&#39;)).values</span>
<span class="sd"> array([&#39;a&#39;, &#39;a&#39;, &#39;b&#39;, &#39;c&#39;], dtype=object)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span><span class="s2">&quot;We recommend using `</span><span class="si">{}</span><span class="s2">.to_numpy()` instead.&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_numpy</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">to_csv</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">sep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;,&quot;</span><span class="p">,</span>
<span class="n">na_rep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span><span class="p">,</span>
<span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">header</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">quotechar</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s1">&#39;&quot;&#39;</span><span class="p">,</span>
<span class="n">date_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">escapechar</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">num_files</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;w&quot;</span><span class="p">,</span>
<span class="n">partition_cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sa">r</span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Write object to a comma-separated values (csv) file.</span>
<span class="sd"> .. note:: pandas-on-Spark `to_csv` writes files to a path or URI. Unlike pandas&#39;,</span>
<span class="sd"> pandas-on-Spark respects HDFS&#39;s property such as &#39;fs.default.name&#39;.</span>
<span class="sd"> .. note:: pandas-on-Spark writes CSV files into the directory, `path`, and writes</span>
<span class="sd"> multiple `part-...` files in the directory when `path` is specified.</span>
<span class="sd"> This behavior was inherited from Apache Spark. The number of partitions can</span>
<span class="sd"> be controlled by `num_files`. This is deprecated.</span>
<span class="sd"> Use `DataFrame.spark.repartition` instead.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path: str, default None</span>
<span class="sd"> File path. If None is provided the result is returned as a string.</span>
<span class="sd"> sep: str, default &#39;,&#39;</span>
<span class="sd"> String of length 1. Field delimiter for the output file.</span>
<span class="sd"> na_rep: str, default &#39;&#39;</span>
<span class="sd"> Missing data representation.</span>
<span class="sd"> columns: sequence, optional</span>
<span class="sd"> Columns to write.</span>
<span class="sd"> header: bool or list of str, default True</span>
<span class="sd"> Write out the column names. If a list of strings is given it is</span>
<span class="sd"> assumed to be aliases for the column names.</span>
<span class="sd"> quotechar: str, default &#39;\&quot;&#39;</span>
<span class="sd"> String of length 1. Character used to quote fields.</span>
<span class="sd"> date_format: str, default None</span>
<span class="sd"> Format string for datetime objects.</span>
<span class="sd"> escapechar: str, default None</span>
<span class="sd"> String of length 1. Character used to escape `sep` and `quotechar`</span>
<span class="sd"> when appropriate.</span>
<span class="sd"> num_files: the number of partitions to be written in `path` directory when</span>
<span class="sd"> this is a path. This is deprecated. Use `DataFrame.spark.repartition` instead.</span>
<span class="sd"> mode: str</span>
<span class="sd"> Python write mode, default &#39;w&#39;.</span>
<span class="sd"> .. note:: mode can accept the strings for Spark writing mode.</span>
<span class="sd"> Such as &#39;append&#39;, &#39;overwrite&#39;, &#39;ignore&#39;, &#39;error&#39;, &#39;errorifexists&#39;.</span>
<span class="sd"> - &#39;append&#39; (equivalent to &#39;a&#39;): Append the new data to existing data.</span>
<span class="sd"> - &#39;overwrite&#39; (equivalent to &#39;w&#39;): Overwrite existing data.</span>
<span class="sd"> - &#39;ignore&#39;: Silently ignore this operation if data already exists.</span>
<span class="sd"> - &#39;error&#39; or &#39;errorifexists&#39;: Throw an exception if data already exists.</span>
<span class="sd"> partition_cols: str or list of str, optional, default None</span>
<span class="sd"> Names of partitioning columns</span>
<span class="sd"> index_col: str or list of str, optional, default: None</span>
<span class="sd"> Column names to be used in Spark to represent pandas-on-Spark&#39;s index. The index name</span>
<span class="sd"> in pandas-on-Spark is ignored. By default, the index is always lost.</span>
<span class="sd"> options: keyword arguments for additional options specific to PySpark.</span>
<span class="sd"> These kwargs are specific to PySpark&#39;s CSV options to pass. Check</span>
<span class="sd"> the options in PySpark&#39;s API documentation for spark.write.csv(...).</span>
<span class="sd"> It has higher priority and overwrites all other options.</span>
<span class="sd"> This parameter only works when `path` is specified.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> str or None</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> read_csv</span>
<span class="sd"> DataFrame.to_delta</span>
<span class="sd"> DataFrame.to_table</span>
<span class="sd"> DataFrame.to_parquet</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(dict(</span>
<span class="sd"> ... date=list(pd.date_range(&#39;2012-1-1 12:00:00&#39;, periods=3, freq=&#39;M&#39;)),</span>
<span class="sd"> ... country=[&#39;KR&#39;, &#39;US&#39;, &#39;JP&#39;],</span>
<span class="sd"> ... code=[1, 2 ,3]), columns=[&#39;date&#39;, &#39;country&#39;, &#39;code&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.sort_values(by=&quot;date&quot;) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span>
<span class="sd"> date country code</span>
<span class="sd"> ... 2012-01-31 12:00:00 KR 1</span>
<span class="sd"> ... 2012-02-29 12:00:00 US 2</span>
<span class="sd"> ... 2012-03-31 12:00:00 JP 3</span>
<span class="sd"> &gt;&gt;&gt; print(df.to_csv()) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> date,country,code</span>
<span class="sd"> 2012-01-31 12:00:00,KR,1</span>
<span class="sd"> 2012-02-29 12:00:00,US,2</span>
<span class="sd"> 2012-03-31 12:00:00,JP,3</span>
<span class="sd"> &gt;&gt;&gt; df.cummax().to_csv(path=r&#39;%s/to_csv/foo.csv&#39; % path, num_files=1)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_csv(</span>
<span class="sd"> ... path=r&#39;%s/to_csv/foo.csv&#39; % path</span>
<span class="sd"> ... ).sort_values(by=&quot;date&quot;) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span>
<span class="sd"> date country code</span>
<span class="sd"> ... 2012-01-31 12:00:00 KR 1</span>
<span class="sd"> ... 2012-02-29 12:00:00 US 2</span>
<span class="sd"> ... 2012-03-31 12:00:00 US 3</span>
<span class="sd"> In case of Series,</span>
<span class="sd"> &gt;&gt;&gt; print(df.date.to_csv()) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> date</span>
<span class="sd"> 2012-01-31 12:00:00</span>
<span class="sd"> 2012-02-29 12:00:00</span>
<span class="sd"> 2012-03-31 12:00:00</span>
<span class="sd"> &gt;&gt;&gt; df.date.to_csv(path=r&#39;%s/to_csv/foo.csv&#39; % path, num_files=1)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_csv(</span>
<span class="sd"> ... path=r&#39;%s/to_csv/foo.csv&#39; % path</span>
<span class="sd"> ... ).sort_values(by=&quot;date&quot;) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span>
<span class="sd"> date</span>
<span class="sd"> ... 2012-01-31 12:00:00</span>
<span class="sd"> ... 2012-02-29 12:00:00</span>
<span class="sd"> ... 2012-03-31 12:00:00</span>
<span class="sd"> You can preserve the index in the roundtrip as below.</span>
<span class="sd"> &gt;&gt;&gt; df.set_index(&quot;country&quot;, append=True, inplace=True)</span>
<span class="sd"> &gt;&gt;&gt; df.date.to_csv(</span>
<span class="sd"> ... path=r&#39;%s/to_csv/bar.csv&#39; % path,</span>
<span class="sd"> ... num_files=1,</span>
<span class="sd"> ... index_col=[&quot;index1&quot;, &quot;index2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; ps.read_csv(</span>
<span class="sd"> ... path=r&#39;%s/to_csv/bar.csv&#39; % path, index_col=[&quot;index1&quot;, &quot;index2&quot;]</span>
<span class="sd"> ... ).sort_values(by=&quot;date&quot;) # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span>
<span class="sd"> date</span>
<span class="sd"> index1 index2</span>
<span class="sd"> ... ... 2012-01-31 12:00:00</span>
<span class="sd"> ... ... 2012-02-29 12:00:00</span>
<span class="sd"> ... ... 2012-03-31 12:00:00</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">path</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># If path is none, just collect and use pandas&#39;s to_csv.</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">to_csv</span><span class="p">(</span>
<span class="kc">None</span><span class="p">,</span>
<span class="n">sep</span><span class="o">=</span><span class="n">sep</span><span class="p">,</span>
<span class="n">na_rep</span><span class="o">=</span><span class="n">na_rep</span><span class="p">,</span>
<span class="n">columns</span><span class="o">=</span><span class="n">columns</span><span class="p">,</span>
<span class="n">header</span><span class="o">=</span><span class="n">header</span><span class="p">,</span>
<span class="n">quotechar</span><span class="o">=</span><span class="n">quotechar</span><span class="p">,</span>
<span class="n">date_format</span><span class="o">=</span><span class="n">date_format</span><span class="p">,</span>
<span class="n">escapechar</span><span class="o">=</span><span class="n">escapechar</span><span class="p">,</span>
<span class="n">index</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="k">if</span> <span class="n">columns</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">columns</span><span class="p">:</span>
<span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">col</span><span class="p">):</span>
<span class="n">label</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">label</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="p">(</span><span class="n">col</span><span class="p">,))</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">index_col</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">index_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">index_col</span><span class="p">]</span>
<span class="k">elif</span> <span class="n">index_col</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">index_cols</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_cols</span> <span class="o">=</span> <span class="n">index_col</span>
<span class="k">if</span> <span class="n">header</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;to_csv only support one-level index column now&quot;</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">header</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">to_spark</span><span class="p">(</span><span class="n">index_col</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">index_cols</span><span class="p">]</span>
<span class="o">+</span> <span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="nb">str</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span>
<span class="n">new_name</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">new_name</span><span class="p">)</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="nb">zip</span><span class="p">(</span><span class="n">column_labels</span><span class="p">,</span> <span class="n">header</span><span class="p">))</span>
<span class="p">]</span>
<span class="p">)</span>
<span class="n">header</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">to_spark</span><span class="p">(</span><span class="n">index_col</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">index_cols</span><span class="p">]</span>
<span class="o">+</span> <span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="nb">str</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">if</span> <span class="n">label</span> <span class="ow">is</span> <span class="kc">None</span> <span class="k">else</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">))</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">column_labels</span><span class="p">)</span>
<span class="p">]</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">num_files</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;`num_files` has been deprecated and might be removed in a future version. &quot;</span>
<span class="s2">&quot;Use `DataFrame.spark.repartition` instead.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="n">num_files</span><span class="p">)</span>
<span class="n">mode</span> <span class="o">=</span> <span class="n">validate_mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span>
<span class="n">builder</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span>
<span class="k">if</span> <span class="n">partition_cols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">builder</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="n">partition_cols</span><span class="p">)</span>
<span class="n">builder</span><span class="o">.</span><span class="n">_set_opts</span><span class="p">(</span>
<span class="n">sep</span><span class="o">=</span><span class="n">sep</span><span class="p">,</span>
<span class="n">nullValue</span><span class="o">=</span><span class="n">na_rep</span><span class="p">,</span>
<span class="n">header</span><span class="o">=</span><span class="n">header</span><span class="p">,</span>
<span class="n">quote</span><span class="o">=</span><span class="n">quotechar</span><span class="p">,</span>
<span class="n">dateFormat</span><span class="o">=</span><span class="n">date_format</span><span class="p">,</span>
<span class="n">charToEscapeQuoteEscaping</span><span class="o">=</span><span class="n">escapechar</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">builder</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;csv&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="k">def</span> <span class="nf">to_json</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">path</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">compression</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;uncompressed&quot;</span><span class="p">,</span>
<span class="n">num_files</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;w&quot;</span><span class="p">,</span>
<span class="n">orient</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;records&quot;</span><span class="p">,</span>
<span class="n">lines</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">partition_cols</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">**</span><span class="n">options</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Convert the object to a JSON string.</span>
<span class="sd"> .. note:: pandas-on-Spark `to_json` writes files to a path or URI. Unlike pandas&#39;,</span>
<span class="sd"> pandas-on-Spark respects HDFS&#39;s property such as &#39;fs.default.name&#39;.</span>
<span class="sd"> .. note:: pandas-on-Spark writes JSON files into the directory, `path`, and writes</span>
<span class="sd"> multiple `part-...` files in the directory when `path` is specified.</span>
<span class="sd"> This behavior was inherited from Apache Spark. The number of partitions can</span>
<span class="sd"> be controlled by `num_files`. This is deprecated.</span>
<span class="sd"> Use `DataFrame.spark.repartition` instead.</span>
<span class="sd"> .. note:: output JSON format is different from pandas&#39;. It always uses `orient=&#39;records&#39;`</span>
<span class="sd"> for its output. This behavior might have to change soon.</span>
<span class="sd"> .. note:: Set `ignoreNullFields` keyword argument to `True` to omit `None` or `NaN` values</span>
<span class="sd"> when writing JSON objects. It works only when `path` is provided.</span>
<span class="sd"> Note NaN&#39;s and None will be converted to null and datetime objects</span>
<span class="sd"> will be converted to UNIX timestamps.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path: string, optional</span>
<span class="sd"> File path. If not specified, the result is returned as</span>
<span class="sd"> a string.</span>
<span class="sd"> lines: bool, default True</span>
<span class="sd"> If ‘orient’ is ‘records’ write out line delimited JSON format.</span>
<span class="sd"> Will throw ValueError if incorrect ‘orient’ since others are not</span>
<span class="sd"> list like. It should be always True for now.</span>
<span class="sd"> orient: str, default &#39;records&#39;</span>
<span class="sd"> It should be always &#39;records&#39; for now.</span>
<span class="sd"> compression: {&#39;gzip&#39;, &#39;bz2&#39;, &#39;xz&#39;, None}</span>
<span class="sd"> A string representing the compression to use in the output file,</span>
<span class="sd"> only used when the first argument is a filename. By default, the</span>
<span class="sd"> compression is inferred from the filename.</span>
<span class="sd"> num_files: the number of partitions to be written in `path` directory when</span>
<span class="sd"> this is a path. This is deprecated. Use `DataFrame.spark.repartition` instead.</span>
<span class="sd"> mode: str</span>
<span class="sd"> Python write mode, default &#39;w&#39;.</span>
<span class="sd"> .. note:: mode can accept the strings for Spark writing mode.</span>
<span class="sd"> Such as &#39;append&#39;, &#39;overwrite&#39;, &#39;ignore&#39;, &#39;error&#39;, &#39;errorifexists&#39;.</span>
<span class="sd"> - &#39;append&#39; (equivalent to &#39;a&#39;): Append the new data to existing data.</span>
<span class="sd"> - &#39;overwrite&#39; (equivalent to &#39;w&#39;): Overwrite existing data.</span>
<span class="sd"> - &#39;ignore&#39;: Silently ignore this operation if data already exists.</span>
<span class="sd"> - &#39;error&#39; or &#39;errorifexists&#39;: Throw an exception if data already exists.</span>
<span class="sd"> partition_cols: str or list of str, optional, default None</span>
<span class="sd"> Names of partitioning columns</span>
<span class="sd"> index_col: str or list of str, optional, default: None</span>
<span class="sd"> Column names to be used in Spark to represent pandas-on-Spark&#39;s index. The index name</span>
<span class="sd"> in pandas-on-Spark is ignored. By default, the index is always lost.</span>
<span class="sd"> options: keyword arguments for additional options specific to PySpark.</span>
<span class="sd"> It is specific to PySpark&#39;s JSON options to pass. Check</span>
<span class="sd"> the options in PySpark&#39;s API documentation for `spark.write.json(...)`.</span>
<span class="sd"> It has a higher priority and overwrites all other options.</span>
<span class="sd"> This parameter only works when `path` is specified.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> str or None</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[&#39;a&#39;, &#39;b&#39;], [&#39;c&#39;, &#39;d&#39;]],</span>
<span class="sd"> ... columns=[&#39;col 1&#39;, &#39;col 2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.to_json()</span>
<span class="sd"> &#39;[{&quot;col 1&quot;:&quot;a&quot;,&quot;col 2&quot;:&quot;b&quot;},{&quot;col 1&quot;:&quot;c&quot;,&quot;col 2&quot;:&quot;d&quot;}]&#39;</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;col 1&#39;].to_json()</span>
<span class="sd"> &#39;[{&quot;col 1&quot;:&quot;a&quot;},{&quot;col 1&quot;:&quot;c&quot;}]&#39;</span>
<span class="sd"> &gt;&gt;&gt; df.to_json(path=r&#39;%s/to_json/foo.json&#39; % path, num_files=1)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_json(</span>
<span class="sd"> ... path=r&#39;%s/to_json/foo.json&#39; % path</span>
<span class="sd"> ... ).sort_values(by=&quot;col 1&quot;)</span>
<span class="sd"> col 1 col 2</span>
<span class="sd"> 0 a b</span>
<span class="sd"> 1 c d</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;col 1&#39;].to_json(path=r&#39;%s/to_json/foo.json&#39; % path, num_files=1, index_col=&quot;index&quot;)</span>
<span class="sd"> &gt;&gt;&gt; ps.read_json(</span>
<span class="sd"> ... path=r&#39;%s/to_json/foo.json&#39; % path, index_col=&quot;index&quot;</span>
<span class="sd"> ... ).sort_values(by=&quot;col 1&quot;) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> col 1</span>
<span class="sd"> index</span>
<span class="sd"> 0 a</span>
<span class="sd"> 1 c</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="s2">&quot;options&quot;</span> <span class="ow">in</span> <span class="n">options</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">),</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">options</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">options</span> <span class="o">=</span> <span class="n">options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;options&quot;</span><span class="p">)</span>
<span class="n">default_options</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span><span class="s2">&quot;ignoreNullFields&quot;</span><span class="p">:</span> <span class="kc">False</span><span class="p">}</span>
<span class="n">options</span> <span class="o">=</span> <span class="p">{</span><span class="o">**</span><span class="n">default_options</span><span class="p">,</span> <span class="o">**</span><span class="n">options</span><span class="p">}</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">lines</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;lines=False is not implemented yet.&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">orient</span> <span class="o">!=</span> <span class="s2">&quot;records&quot;</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;orient=&#39;records&#39; is supported only for now.&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">path</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="c1"># If path is none, just collect and use pandas&#39;s to_json.</span>
<span class="n">psdf_or_ser</span> <span class="o">=</span> <span class="bp">self</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">psdf_or_ser</span><span class="o">.</span><span class="n">_to_pandas</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="c1"># To make the format consistent and readable by `read_json`, convert it to pandas&#39; and</span>
<span class="c1"># use &#39;records&#39; orient for now.</span>
<span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">to_json</span><span class="p">(</span><span class="n">orient</span><span class="o">=</span><span class="s2">&quot;records&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">assert</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">to_spark</span><span class="p">(</span><span class="n">index_col</span><span class="o">=</span><span class="n">index_col</span><span class="p">)</span>
<span class="k">if</span> <span class="n">num_files</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;`num_files` has been deprecated and might be removed in a future version. &quot;</span>
<span class="s2">&quot;Use `DataFrame.spark.repartition` instead.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">repartition</span><span class="p">(</span><span class="n">num_files</span><span class="p">)</span>
<span class="n">mode</span> <span class="o">=</span> <span class="n">validate_mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span>
<span class="n">builder</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">write</span><span class="o">.</span><span class="n">mode</span><span class="p">(</span><span class="n">mode</span><span class="p">)</span>
<span class="k">if</span> <span class="n">partition_cols</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">builder</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="n">partition_cols</span><span class="p">)</span>
<span class="n">builder</span><span class="o">.</span><span class="n">_set_opts</span><span class="p">(</span><span class="n">compression</span><span class="o">=</span><span class="n">compression</span><span class="p">)</span>
<span class="n">builder</span><span class="o">.</span><span class="n">options</span><span class="p">(</span><span class="o">**</span><span class="n">options</span><span class="p">)</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="s2">&quot;json&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="n">path</span><span class="p">)</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="k">def</span> <span class="nf">to_excel</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">excel_writer</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">ExcelWriter</span><span class="p">],</span>
<span class="n">sheet_name</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;Sheet1&quot;</span><span class="p">,</span>
<span class="n">na_rep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;&quot;</span><span class="p">,</span>
<span class="n">float_format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">header</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">index_label</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">startrow</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">startcol</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">engine</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">merge_cells</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">inf_rep</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;inf&quot;</span><span class="p">,</span>
<span class="n">freeze_panes</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">int</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Write object to an Excel sheet.</span>
<span class="sd"> .. note:: This method should only be used if the resulting DataFrame is expected</span>
<span class="sd"> to be small, as all the data is loaded into the driver&#39;s memory.</span>
<span class="sd"> To write a single object to an Excel .xlsx file it is only necessary to</span>
<span class="sd"> specify a target file name. To write to multiple sheets it is necessary to</span>
<span class="sd"> create an `ExcelWriter` object with a target file name, and specify a sheet</span>
<span class="sd"> in the file to write to.</span>
<span class="sd"> Multiple sheets may be written to by specifying unique `sheet_name`.</span>
<span class="sd"> With all data written to the file it is necessary to save the changes.</span>
<span class="sd"> Note that creating an `ExcelWriter` object with a file name that already</span>
<span class="sd"> exists will result in the contents of the existing file being erased.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> excel_writer: str or ExcelWriter object</span>
<span class="sd"> File path or existing ExcelWriter.</span>
<span class="sd"> sheet_name: str, default &#39;Sheet1&#39;</span>
<span class="sd"> Name of sheet which will contain DataFrame.</span>
<span class="sd"> na_rep: str, default &#39;&#39;</span>
<span class="sd"> Missing data representation.</span>
<span class="sd"> float_format: str, optional</span>
<span class="sd"> Format string for floating point numbers. For example</span>
<span class="sd"> ``float_format=&quot;%%.2f&quot;`` will format 0.1234 to 0.12.</span>
<span class="sd"> columns: sequence or list of str, optional</span>
<span class="sd"> Columns to write.</span>
<span class="sd"> header: bool or list of str, default True</span>
<span class="sd"> Write out the column names. If a list of string is given it is</span>
<span class="sd"> assumed to be aliases for the column names.</span>
<span class="sd"> index: bool, default True</span>
<span class="sd"> Write row names (index).</span>
<span class="sd"> index_label: str or sequence, optional</span>
<span class="sd"> Column label for index column(s) if desired. If not specified, and</span>
<span class="sd"> `header` and `index` are True, then the index names are used. A</span>
<span class="sd"> sequence should be given if the DataFrame uses MultiIndex.</span>
<span class="sd"> startrow: int, default 0</span>
<span class="sd"> Upper left cell row to dump data frame.</span>
<span class="sd"> startcol: int, default 0</span>
<span class="sd"> Upper left cell column to dump data frame.</span>
<span class="sd"> engine: str, optional</span>
<span class="sd"> Write engine to use, &#39;openpyxl&#39; or &#39;xlsxwriter&#39;. You can also set this</span>
<span class="sd"> via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``, and</span>
<span class="sd"> ``io.excel.xlsm.writer``.</span>
<span class="sd"> merge_cells: bool, default True</span>
<span class="sd"> Write MultiIndex and Hierarchical Rows as merged cells.</span>
<span class="sd"> inf_rep: str, default &#39;inf&#39;</span>
<span class="sd"> Representation for infinity (there is no native representation for</span>
<span class="sd"> infinity in Excel).</span>
<span class="sd"> freeze_panes: tuple of int (length 2), optional</span>
<span class="sd"> Specifies the one-based bottommost row and rightmost column that</span>
<span class="sd"> is to be frozen.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Once a workbook has been saved it is not possible write further data</span>
<span class="sd"> without rewriting the whole workbook.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> read_excel: Read Excel file.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Create, write to, and save a workbook:</span>
<span class="sd"> &gt;&gt;&gt; df1 = ps.DataFrame([[&#39;a&#39;, &#39;b&#39;], [&#39;c&#39;, &#39;d&#39;]],</span>
<span class="sd"> ... index=[&#39;row 1&#39;, &#39;row 2&#39;],</span>
<span class="sd"> ... columns=[&#39;col 1&#39;, &#39;col 2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df1.to_excel(&quot;output.xlsx&quot;) # doctest: +SKIP</span>
<span class="sd"> To specify the sheet name:</span>
<span class="sd"> &gt;&gt;&gt; df1.to_excel(&quot;output.xlsx&quot;) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; df1.to_excel(&quot;output.xlsx&quot;,</span>
<span class="sd"> ... sheet_name=&#39;Sheet_name_1&#39;) # doctest: +SKIP</span>
<span class="sd"> If you wish to write to more than one sheet in the workbook, it is</span>
<span class="sd"> necessary to specify an ExcelWriter object:</span>
<span class="sd"> &gt;&gt;&gt; with pd.ExcelWriter(&#39;output.xlsx&#39;) as writer: # doctest: +SKIP</span>
<span class="sd"> ... df1.to_excel(writer, sheet_name=&#39;Sheet_name_1&#39;)</span>
<span class="sd"> ... df2.to_excel(writer, sheet_name=&#39;Sheet_name_2&#39;)</span>
<span class="sd"> To set the library that is used to write the Excel file,</span>
<span class="sd"> you can pass the `engine` keyword (the default engine is</span>
<span class="sd"> automatically chosen depending on the file extension):</span>
<span class="sd"> &gt;&gt;&gt; df1.to_excel(&#39;output1.xlsx&#39;, engine=&#39;xlsxwriter&#39;) # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;`to_excel` loads all data into the driver&#39;s memory. &quot;</span>
<span class="s2">&quot;It should only be used if the resulting DataFrame is expected to be small.&quot;</span>
<span class="p">)</span>
<span class="c1"># Make sure locals() call is at the top of the function so we don&#39;t capture local variables.</span>
<span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">f</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="o">.</span><span class="n">to_excel</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="n">f</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="o">.</span><span class="n">to_excel</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Constructor expects DataFrame or Series; however, &quot;</span> <span class="s2">&quot;got [</span><span class="si">%s</span><span class="s2">]&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="bp">self</span><span class="p">,)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_excel</span><span class="p">,</span> <span class="n">f</span><span class="p">,</span> <span class="n">args</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">to_hdf</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">path_or_buf</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">HDFStore</span><span class="p">],</span>
<span class="n">key</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">mode</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;a&quot;</span><span class="p">,</span>
<span class="n">complevel</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">complib</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">append</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="nb">format</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">min_itemsize</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">int</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">nan_rep</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">dropna</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">data_columns</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">errors</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;strict&quot;</span><span class="p">,</span>
<span class="n">encoding</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;UTF-8&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Write the contained data to an HDF5 file using HDFStore.</span>
<span class="sd"> .. note:: This method should only be used if the resulting DataFrame is expected</span>
<span class="sd"> to be small, as all the data is loaded into the driver&#39;s memory.</span>
<span class="sd"> .. versionadded:: 4.0.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> path_or_buf : str or pandas.HDFStore</span>
<span class="sd"> File path or HDFStore object.</span>
<span class="sd"> key : str</span>
<span class="sd"> Identifier for the group in the store.</span>
<span class="sd"> mode : {&#39;a&#39;, &#39;w&#39;, &#39;r+&#39;}, default &#39;a&#39;</span>
<span class="sd"> Mode to open file:</span>
<span class="sd"> - &#39;w&#39;: write, a new file is created (an existing file with</span>
<span class="sd"> the same name would be deleted).</span>
<span class="sd"> - &#39;a&#39;: append, an existing file is opened for reading and</span>
<span class="sd"> writing, and if the file does not exist it is created.</span>
<span class="sd"> - &#39;r+&#39;: similar to &#39;a&#39;, but the file must already exist.</span>
<span class="sd"> complevel : {0-9}, default None</span>
<span class="sd"> Specifies a compression level for data.</span>
<span class="sd"> A value of 0 or None disables compression.</span>
<span class="sd"> complib : {&#39;zlib&#39;, &#39;lzo&#39;, &#39;bzip2&#39;, &#39;blosc&#39;}, default &#39;zlib&#39;</span>
<span class="sd"> Specifies the compression library to be used.</span>
<span class="sd"> These additional compressors for Blosc are supported</span>
<span class="sd"> (default if no compressor specified: &#39;blosc:blosclz&#39;):</span>
<span class="sd"> {&#39;blosc:blosclz&#39;, &#39;blosc:lz4&#39;, &#39;blosc:lz4hc&#39;, &#39;blosc:snappy&#39;,</span>
<span class="sd"> &#39;blosc:zlib&#39;, &#39;blosc:zstd&#39;}.</span>
<span class="sd"> Specifying a compression library which is not available issues</span>
<span class="sd"> a ValueError.</span>
<span class="sd"> append : bool, default False</span>
<span class="sd"> For Table formats, append the input data to the existing.</span>
<span class="sd"> format : {&#39;fixed&#39;, &#39;table&#39;, None}, default &#39;fixed&#39;</span>
<span class="sd"> Possible values:</span>
<span class="sd"> - &#39;fixed&#39;: Fixed format. Fast writing/reading. Not-appendable,</span>
<span class="sd"> nor searchable.</span>
<span class="sd"> - &#39;table&#39;: Table format. Write as a PyTables Table structure</span>
<span class="sd"> which may perform worse but allow more flexible operations</span>
<span class="sd"> like searching / selecting subsets of the data.</span>
<span class="sd"> - If None, pd.get_option(&#39;io.hdf.default_format&#39;) is checked,</span>
<span class="sd"> followed by fallback to &quot;fixed&quot;.</span>
<span class="sd"> index : bool, default True</span>
<span class="sd"> Write DataFrame index as a column.</span>
<span class="sd"> min_itemsize : dict or int, optional</span>
<span class="sd"> Map column names to minimum string sizes for columns.</span>
<span class="sd"> nan_rep : Any, optional</span>
<span class="sd"> How to represent null values as str.</span>
<span class="sd"> Not allowed with append=True.</span>
<span class="sd"> dropna : bool, default False, optional</span>
<span class="sd"> Remove missing values.</span>
<span class="sd"> data_columns : list of columns or True, optional</span>
<span class="sd"> List of columns to create as indexed data columns for on-disk</span>
<span class="sd"> queries, or True to use all columns. By default only the axes</span>
<span class="sd"> of the object are indexed. Applicable only to format=&#39;table&#39;.</span>
<span class="sd"> errors : str, default &#39;strict&#39;</span>
<span class="sd"> Specifies how encoding and decoding errors are to be handled.</span>
<span class="sd"> See the errors argument for :func:`open` for a full list</span>
<span class="sd"> of options.</span>
<span class="sd"> encoding : str, default &quot;UTF-8&quot;</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.to_orc : Write a DataFrame to the binary orc format.</span>
<span class="sd"> DataFrame.to_parquet : Write a DataFrame to the binary parquet format.</span>
<span class="sd"> DataFrame.to_csv : Write out to a csv file.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 2, 3], &#39;B&#39;: [4, 5, 6]},</span>
<span class="sd"> ... index=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;]) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; df.to_hdf(&#39;data.h5&#39;, key=&#39;df&#39;, mode=&#39;w&#39;) # doctest: +SKIP</span>
<span class="sd"> We can add another object to the same file:</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([1, 2, 3, 4]) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; s.to_hdf(&#39;data.h5&#39;, key=&#39;s&#39;) # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;`to_hdf` loads all data into the driver&#39;s memory. &quot;</span>
<span class="s2">&quot;It should only be used if the resulting DataFrame is expected to be small.&quot;</span>
<span class="p">)</span>
<span class="c1"># Make sure locals() call is at the top of the function so we don&#39;t capture local variables.</span>
<span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">f</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="o">.</span><span class="n">to_hdf</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="n">f</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="o">.</span><span class="n">to_hdf</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Constructor expects DataFrame or Series; however, &quot;</span> <span class="s2">&quot;got [</span><span class="si">%s</span><span class="s2">]&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="bp">self</span><span class="p">,)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">(),</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_hdf</span><span class="p">,</span> <span class="n">f</span><span class="p">,</span> <span class="n">args</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">mean</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the mean of the values.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> skipna: bool, default True</span>
<span class="sd"> Exclude NA/null values when computing the result.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supported including NA/null values.</span>
<span class="sd"> numeric_only: bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> mean: scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, 0.2, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.mean()</span>
<span class="sd"> a 2.0</span>
<span class="sd"> b 0.2</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.mean(axis=1)</span>
<span class="sd"> 0 0.55</span>
<span class="sd"> 1 1.10</span>
<span class="sd"> 2 1.65</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].mean()</span>
<span class="sd"> 2.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">mean</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">mean</span><span class="p">,</span>
<span class="n">name</span><span class="o">=</span><span class="s2">&quot;mean&quot;</span><span class="p">,</span>
<span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span>
<span class="n">skipna</span><span class="o">=</span><span class="n">skipna</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">sum</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the sum of the values.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> skipna: bool, default True</span>
<span class="sd"> Exclude NA/null values when computing the result.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Added *skipna* to exclude.</span>
<span class="sd"> numeric_only: bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> min_count: int, default 0</span>
<span class="sd"> The required number of valid values to perform the operation. If fewer than</span>
<span class="sd"> ``min_count`` non-NA values are present the result will be NA.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> sum: scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, np.nan, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.sum()</span>
<span class="sd"> a 6.0</span>
<span class="sd"> b 0.4</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.sum(axis=1)</span>
<span class="sd"> 0 1.1</span>
<span class="sd"> 1 2.0</span>
<span class="sd"> 2 3.3</span>
<span class="sd"> 3 0.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.sum(min_count=3)</span>
<span class="sd"> a 6.0</span>
<span class="sd"> b NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.sum(axis=1, min_count=1)</span>
<span class="sd"> 0 1.1</span>
<span class="sd"> 1 2.0</span>
<span class="sd"> 2 3.3</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].sum()</span>
<span class="sd"> 6.0</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].sum(min_count=3)</span>
<span class="sd"> 6.0</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;b&#39;].sum(min_count=3)</span>
<span class="sd"> nan</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">axis</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;The behavior of DataFrame.sum with axis=None is deprecated, &quot;</span>
<span class="s2">&quot;in a future version this will reduce over both axes and return a scalar. &quot;</span>
<span class="s2">&quot;To retain the old behavior, pass axis=0 (or do not pass axis)&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">elif</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">def</span> <span class="nf">sum</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">spark_column</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="nb">sum</span><span class="p">,</span>
<span class="n">name</span><span class="o">=</span><span class="s2">&quot;sum&quot;</span><span class="p">,</span>
<span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span>
<span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span><span class="p">,</span>
<span class="n">skipna</span><span class="o">=</span><span class="n">skipna</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">product</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the product of the values.</span>
<span class="sd"> .. note:: unlike pandas&#39;, pandas-on-Spark&#39;s emulates product by ``exp(sum(log(...)))``</span>
<span class="sd"> trick. Therefore, it only works for positive numbers.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> skipna: bool, default True</span>
<span class="sd"> Exclude NA/null values when computing the result.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supported including NA/null values.</span>
<span class="sd"> numeric_only: bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> min_count: int, default 0</span>
<span class="sd"> The required number of valid values to perform the operation. If fewer than</span>
<span class="sd"> ``min_count`` non-NA values are present the result will be NA.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> Non-numeric type column is not included to the result.</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({&#39;A&#39;: [1, 2, 3, 4, 5],</span>
<span class="sd"> ... &#39;B&#39;: [10, 20, 30, 40, 50],</span>
<span class="sd"> ... &#39;C&#39;: [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;]})</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> A B C</span>
<span class="sd"> 0 1 10 a</span>
<span class="sd"> 1 2 20 b</span>
<span class="sd"> 2 3 30 c</span>
<span class="sd"> 3 4 40 d</span>
<span class="sd"> 4 5 50 e</span>
<span class="sd"> &gt;&gt;&gt; psdf.prod()</span>
<span class="sd"> A 120</span>
<span class="sd"> B 12000000</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> If there is no numeric type columns, returns empty Series.</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({&quot;key&quot;: [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;], &quot;val&quot;: [&#39;x&#39;, &#39;y&#39;, &#39;z&#39;]}).prod() # doctest: +SKIP</span>
<span class="sd"> Series([], dtype: float64)</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1, 2, 3, 4, 5]).prod()</span>
<span class="sd"> 120</span>
<span class="sd"> By default, the product of an empty or all-NA Series is ``1``</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([]).prod() # doctest: +SKIP</span>
<span class="sd"> 1.0</span>
<span class="sd"> This can be controlled with the ``min_count`` parameter</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([]).prod(min_count=1) # doctest: +SKIP</span>
<span class="sd"> nan</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">axis</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;The behavior of DataFrame.product with axis=None is deprecated, &quot;</span>
<span class="s2">&quot;in a future version this will reduce over both axes and return a scalar. &quot;</span>
<span class="s2">&quot;To retain the old behavior, pass axis=0 (or do not pass axis)&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">elif</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">def</span> <span class="nf">prod</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">SF</span><span class="o">.</span><span class="n">product</span><span class="p">(</span><span class="n">spark_column</span><span class="p">,</span> <span class="n">skipna</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">prod</span><span class="p">,</span>
<span class="n">name</span><span class="o">=</span><span class="s2">&quot;prod&quot;</span><span class="p">,</span>
<span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span>
<span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span><span class="p">,</span>
<span class="n">skipna</span><span class="o">=</span><span class="n">skipna</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">prod</span> <span class="o">=</span> <span class="n">product</span>
<span class="k">def</span> <span class="nf">skew</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return unbiased skew normalized by N-1.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> skipna: bool, default True</span>
<span class="sd"> Exclude NA/null values when computing the result.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supported including NA/null values.</span>
<span class="sd"> numeric_only: bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> skew: scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, 0.2, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.skew()</span>
<span class="sd"> a 0.0</span>
<span class="sd"> b 0.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].skew()</span>
<span class="sd"> 0.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">def</span> <span class="nf">skew</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">SF</span><span class="o">.</span><span class="n">skew</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">skew</span><span class="p">,</span>
<span class="n">name</span><span class="o">=</span><span class="s2">&quot;skew&quot;</span><span class="p">,</span>
<span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span>
<span class="n">skipna</span><span class="o">=</span><span class="n">skipna</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">kurtosis</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return unbiased kurtosis using Fisher’s definition of kurtosis (kurtosis of normal == 0.0).</span>
<span class="sd"> Normalized by N-1.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> skipna: bool, default True</span>
<span class="sd"> Exclude NA/null values when computing the result.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supported including NA/null values.</span>
<span class="sd"> numeric_only: bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> kurt: scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan, 6], &#39;b&#39;: [0.1, 0.2, 0.3, np.nan, 0.8]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.kurtosis()</span>
<span class="sd"> a 1.500000</span>
<span class="sd"> b 2.703924</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].kurtosis()</span>
<span class="sd"> 1.5</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">def</span> <span class="nf">kurtosis</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">SF</span><span class="o">.</span><span class="n">kurt</span><span class="p">(</span><span class="n">spark_column</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">kurtosis</span><span class="p">,</span>
<span class="n">name</span><span class="o">=</span><span class="s2">&quot;kurtosis&quot;</span><span class="p">,</span>
<span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span>
<span class="n">skipna</span><span class="o">=</span><span class="n">skipna</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">kurt</span> <span class="o">=</span> <span class="n">kurtosis</span>
<span class="k">def</span> <span class="nf">min</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the minimum of the values.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> skipna: bool, default True</span>
<span class="sd"> Exclude NA/null values when computing the result.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supported including NA/null values.</span>
<span class="sd"> numeric_only: bool, default None</span>
<span class="sd"> If True, include only float, int, boolean columns. This parameter is mainly for</span>
<span class="sd"> pandas compatibility. False is supported; however, the columns should</span>
<span class="sd"> be all numeric or all non-numeric.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> min: scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, 0.2, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.min()</span>
<span class="sd"> a 1.0</span>
<span class="sd"> b 0.1</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.min(axis=1)</span>
<span class="sd"> 0 0.1</span>
<span class="sd"> 1 0.2</span>
<span class="sd"> 2 0.3</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].min()</span>
<span class="sd"> 1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">elif</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">),</span>
<span class="n">name</span><span class="o">=</span><span class="s2">&quot;min&quot;</span><span class="p">,</span>
<span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span>
<span class="n">skipna</span><span class="o">=</span><span class="n">skipna</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">max</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the maximum of the values.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> skipna: bool, default True</span>
<span class="sd"> Exclude NA/null values when computing the result.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supported including NA/null values.</span>
<span class="sd"> numeric_only: bool, default None</span>
<span class="sd"> If True, include only float, int, boolean columns. This parameter is mainly for</span>
<span class="sd"> pandas compatibility. False is supported; however, the columns should</span>
<span class="sd"> be all numeric or all non-numeric.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> max: scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, 0.2, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.max()</span>
<span class="sd"> a 3.0</span>
<span class="sd"> b 0.3</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.max(axis=1)</span>
<span class="sd"> 0 1.0</span>
<span class="sd"> 1 2.0</span>
<span class="sd"> 2 3.0</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].max()</span>
<span class="sd"> 3.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">elif</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">True</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">psser</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">),</span>
<span class="n">name</span><span class="o">=</span><span class="s2">&quot;max&quot;</span><span class="p">,</span>
<span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span>
<span class="n">skipna</span><span class="o">=</span><span class="n">skipna</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">count</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Count non-NA cells for each column.</span>
<span class="sd"> The values `None`, `NaN` are considered NA.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {0 or ‘index’, 1 or ‘columns’}, default 0</span>
<span class="sd"> If 0 or ‘index’ counts are generated for each column. If 1 or ‘columns’ counts are</span>
<span class="sd"> generated for each row.</span>
<span class="sd"> numeric_only: bool, default False</span>
<span class="sd"> If True, include only float, int, boolean columns. This parameter is mainly for</span>
<span class="sd"> pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> max: scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.shape: Number of DataFrame rows and columns (including NA</span>
<span class="sd"> elements).</span>
<span class="sd"> DataFrame.isna: Boolean same-sized DataFrame showing places of NA</span>
<span class="sd"> elements.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Constructing DataFrame from a dictionary:</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&quot;Person&quot;:</span>
<span class="sd"> ... [&quot;John&quot;, &quot;Myla&quot;, &quot;Lewis&quot;, &quot;John&quot;, &quot;Myla&quot;],</span>
<span class="sd"> ... &quot;Age&quot;: [24., np.nan, 21., 33, 26],</span>
<span class="sd"> ... &quot;Single&quot;: [False, True, True, True, False]},</span>
<span class="sd"> ... columns=[&quot;Person&quot;, &quot;Age&quot;, &quot;Single&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> Person Age Single</span>
<span class="sd"> 0 John 24.0 False</span>
<span class="sd"> 1 Myla NaN True</span>
<span class="sd"> 2 Lewis 21.0 True</span>
<span class="sd"> 3 John 33.0 True</span>
<span class="sd"> 4 Myla 26.0 False</span>
<span class="sd"> Notice the uncounted NA values:</span>
<span class="sd"> &gt;&gt;&gt; df.count()</span>
<span class="sd"> Person 5</span>
<span class="sd"> Age 4</span>
<span class="sd"> Single 5</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.count(axis=1)</span>
<span class="sd"> 0 3</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 3</span>
<span class="sd"> 3 3</span>
<span class="sd"> 4 3</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;Person&#39;].count()</span>
<span class="sd"> 5</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;Age&#39;].count()</span>
<span class="sd"> 4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">Frame</span><span class="o">.</span><span class="n">_count_expr</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;count&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">std</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return sample standard deviation.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> skipna: bool, default True</span>
<span class="sd"> Exclude NA/null values when computing the result.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supported including NA/null values.</span>
<span class="sd"> ddof: int, default 1</span>
<span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span>
<span class="sd"> where N represents the number of elements.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supported including arbitary integers.</span>
<span class="sd"> numeric_only: bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> std: scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, 0.2, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.std()</span>
<span class="sd"> a 1.0</span>
<span class="sd"> b 0.1</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.std(ddof=2)</span>
<span class="sd"> a 1.414214</span>
<span class="sd"> b 0.141421</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.std(axis=1)</span>
<span class="sd"> 0 0.636396</span>
<span class="sd"> 1 1.272792</span>
<span class="sd"> 2 1.909188</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.std(ddof=0)</span>
<span class="sd"> a 0.816497</span>
<span class="sd"> b 0.081650</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].std()</span>
<span class="sd"> 1.0</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].std(ddof=0)</span>
<span class="sd"> 0.816496580927726</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].std(ddof=-1)</span>
<span class="sd"> 0.707106...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ddof</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;ddof must be integer&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">axis</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;The behavior of DataFrame.std with axis=None is deprecated, &quot;</span>
<span class="s2">&quot;in a future version this will reduce over both axes and return a scalar. &quot;</span>
<span class="s2">&quot;To retain the old behavior, pass axis=0 (or do not pass axis)&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">SF</span><span class="o">.</span><span class="n">stddev</span><span class="p">(</span><span class="n">spark_column</span><span class="p">,</span> <span class="n">ddof</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">std</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;std&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">ddof</span><span class="o">=</span><span class="n">ddof</span><span class="p">,</span> <span class="n">skipna</span><span class="o">=</span><span class="n">skipna</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">var</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return unbiased variance.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> ddof: int, default 1</span>
<span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span>
<span class="sd"> where N represents the number of elements.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supported including arbitary integers.</span>
<span class="sd"> numeric_only: bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> var: scalar for a Series, and a Series for a DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, np.nan], &#39;b&#39;: [0.1, 0.2, 0.3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.var()</span>
<span class="sd"> a 1.00</span>
<span class="sd"> b 0.01</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.var(ddof=2)</span>
<span class="sd"> a 2.00</span>
<span class="sd"> b 0.02</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.var(axis=1)</span>
<span class="sd"> 0 0.405</span>
<span class="sd"> 1 1.620</span>
<span class="sd"> 2 3.645</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.var(ddof=0)</span>
<span class="sd"> a 0.666667</span>
<span class="sd"> b 0.006667</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].var()</span>
<span class="sd"> 1.0</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].var(ddof=0)</span>
<span class="sd"> 0.6666666666666666</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].var(ddof=-2)</span>
<span class="sd"> 0.4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ddof</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;ddof must be integer&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">axis</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;The behavior of DataFrame.var with axis=None is deprecated, &quot;</span>
<span class="s2">&quot;in a future version this will reduce over both axes and return a scalar. &quot;</span>
<span class="s2">&quot;To retain the old behavior, pass axis=0 (or do not pass axis)&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">def</span> <span class="nf">var</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">SF</span><span class="o">.</span><span class="n">var</span><span class="p">(</span><span class="n">spark_column</span><span class="p">,</span> <span class="n">ddof</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">var</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;var&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">ddof</span><span class="o">=</span><span class="n">ddof</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">median</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">accuracy</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10000</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the median of the values for the requested axis.</span>
<span class="sd"> .. note:: Unlike pandas&#39;, the median in pandas-on-Spark is an approximated median based upon</span>
<span class="sd"> approximate percentile computation because computing median across a large dataset</span>
<span class="sd"> is extremely expensive.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> skipna: bool, default True</span>
<span class="sd"> Exclude NA/null values when computing the result.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supported including NA/null values.</span>
<span class="sd"> numeric_only: bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> accuracy: int, optional</span>
<span class="sd"> Default accuracy of approximation. Larger value means better accuracy.</span>
<span class="sd"> The relative error can be deduced by 1.0 / accuracy.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> median: scalar or Series</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({</span>
<span class="sd"> ... &#39;a&#39;: [24., 21., 25., 33., 26.], &#39;b&#39;: [1, 2, 3, 4, 5]}, columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b</span>
<span class="sd"> 0 24.0 1</span>
<span class="sd"> 1 21.0 2</span>
<span class="sd"> 2 25.0 3</span>
<span class="sd"> 3 33.0 4</span>
<span class="sd"> 4 26.0 5</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.median()</span>
<span class="sd"> a 25.0</span>
<span class="sd"> b 3.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;a&#39;].median()</span>
<span class="sd"> 25.0</span>
<span class="sd"> &gt;&gt;&gt; (df[&#39;b&#39;] + 100).median()</span>
<span class="sd"> 103.0</span>
<span class="sd"> For multi-index columns,</span>
<span class="sd"> &gt;&gt;&gt; df.columns = pd.MultiIndex.from_tuples([(&#39;x&#39;, &#39;a&#39;), (&#39;y&#39;, &#39;b&#39;)])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> x y</span>
<span class="sd"> a b</span>
<span class="sd"> 0 24.0 1</span>
<span class="sd"> 1 21.0 2</span>
<span class="sd"> 2 25.0 3</span>
<span class="sd"> 3 33.0 4</span>
<span class="sd"> 4 26.0 5</span>
<span class="sd"> On a DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df.median()</span>
<span class="sd"> x a 25.0</span>
<span class="sd"> y b 3.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.median(axis=1)</span>
<span class="sd"> 0 12.5</span>
<span class="sd"> 1 11.5</span>
<span class="sd"> 2 14.0</span>
<span class="sd"> 3 18.5</span>
<span class="sd"> 4 15.5</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> On a Series:</span>
<span class="sd"> &gt;&gt;&gt; df[(&#39;x&#39;, &#39;a&#39;)].median()</span>
<span class="sd"> 25.0</span>
<span class="sd"> &gt;&gt;&gt; (df[(&#39;y&#39;, &#39;b&#39;)] + 100).median()</span>
<span class="sd"> 103.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">accuracy</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;accuracy must be an integer; however, got [</span><span class="si">%s</span><span class="s2">]&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">median</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="p">(</span><span class="n">BooleanType</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)):</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">percentile_approx</span><span class="p">(</span><span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">DoubleType</span><span class="p">()),</span> <span class="mf">0.5</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">median</span><span class="p">,</span>
<span class="n">name</span><span class="o">=</span><span class="s2">&quot;median&quot;</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span>
<span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span>
<span class="n">skipna</span><span class="o">=</span><span class="n">skipna</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">sem</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return unbiased standard error of the mean over requested axis.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {index (0), columns (1)}</span>
<span class="sd"> Axis for the function to be applied on.</span>
<span class="sd"> skipna: bool, default True</span>
<span class="sd"> Exclude NA/null values when computing the result.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supported including NA/null values.</span>
<span class="sd"> ddof: int, default 1</span>
<span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span>
<span class="sd"> where N represents the number of elements.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supported including arbitary integers.</span>
<span class="sd"> numeric_only: bool, default None</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> scalar(for Series) or Series(for DataFrame)</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({&quot;a&quot;: [1, 2, 3], &quot;b&quot;: [4, 5, 6]})</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> a b</span>
<span class="sd"> 0 1 4</span>
<span class="sd"> 1 2 5</span>
<span class="sd"> 2 3 6</span>
<span class="sd"> &gt;&gt;&gt; psdf.sem()</span>
<span class="sd"> a 0.57735</span>
<span class="sd"> b 0.57735</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; psdf.sem(ddof=0)</span>
<span class="sd"> a 0.471405</span>
<span class="sd"> b 0.471405</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; psdf.sem(ddof=2)</span>
<span class="sd"> a 0.816497</span>
<span class="sd"> b 0.816497</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; psdf.sem(axis=1)</span>
<span class="sd"> 0 1.5</span>
<span class="sd"> 1 1.5</span>
<span class="sd"> 2 1.5</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> Support for Series</span>
<span class="sd"> &gt;&gt;&gt; psser = psdf.a</span>
<span class="sd"> &gt;&gt;&gt; psser</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 3</span>
<span class="sd"> Name: a, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; psser.sem()</span>
<span class="sd"> 0.5773502691896258</span>
<span class="sd"> &gt;&gt;&gt; psser.sem(ddof=0)</span>
<span class="sd"> 0.47140452079103173</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ddof</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;ddof must be integer&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">axis</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;The behavior of DataFrame.sem with axis=None is deprecated, &quot;</span>
<span class="s2">&quot;in a future version this will reduce over both axes and return a scalar. &quot;</span>
<span class="s2">&quot;To retain the old behavior, pass axis=0 (or do not pass axis)&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">numeric_only</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">spark_column</span> <span class="o">=</span> <span class="n">spark_column</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">LongType</span><span class="p">())</span>
<span class="k">elif</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">spark_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Could not convert </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">) to numeric&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">spark_type</span><span class="p">),</span> <span class="n">spark_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">SF</span><span class="o">.</span><span class="n">stddev</span><span class="p">(</span><span class="n">spark_column</span><span class="p">,</span> <span class="n">ddof</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">sem</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">return</span> <span class="n">std</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span> <span class="o">/</span> <span class="n">F</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">Frame</span><span class="o">.</span><span class="n">_count_expr</span><span class="p">(</span><span class="n">psser</span><span class="p">))</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">sem</span><span class="p">,</span>
<span class="n">name</span><span class="o">=</span><span class="s2">&quot;sem&quot;</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span>
<span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span>
<span class="n">ddof</span><span class="o">=</span><span class="n">ddof</span><span class="p">,</span>
<span class="n">skipna</span><span class="o">=</span><span class="n">skipna</span><span class="p">,</span>
<span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return an int representing the number of elements in this object.</span>
<span class="sd"> Return the number of rows if Series. Otherwise return the number of</span>
<span class="sd"> rows times number of columns if DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series({&#39;a&#39;: 1, &#39;b&#39;: 2, &#39;c&#39;: None})</span>
<span class="sd"> &gt;&gt;&gt; s.size</span>
<span class="sd"> 3</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;col1&#39;: [1, 2, None], &#39;col2&#39;: [3, 4, None]})</span>
<span class="sd"> &gt;&gt;&gt; df.size</span>
<span class="sd"> 6</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(index=[1, 2, None])</span>
<span class="sd"> &gt;&gt;&gt; df.size</span>
<span class="sd"> 0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">num_columns</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span>
<span class="k">if</span> <span class="n">num_columns</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="mi">0</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">*</span> <span class="n">num_columns</span> <span class="c1"># type: ignore[arg-type]</span>
<span class="k">def</span> <span class="nf">abs</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return a Series/DataFrame with absolute numeric value of each element.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> abs: Series/DataFrame containing the absolute value of each element.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Absolute numeric values in a Series.</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([-1.10, 2, -3.33, 4])</span>
<span class="sd"> &gt;&gt;&gt; s.abs()</span>
<span class="sd"> 0 1.10</span>
<span class="sd"> 1 2.00</span>
<span class="sd"> 2 3.33</span>
<span class="sd"> 3 4.00</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> Absolute numeric values in a DataFrame.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({</span>
<span class="sd"> ... &#39;a&#39;: [4, 5, 6, 7],</span>
<span class="sd"> ... &#39;b&#39;: [10, 20, 30, 40],</span>
<span class="sd"> ... &#39;c&#39;: [100, 50, -30, -50]</span>
<span class="sd"> ... },</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.abs()</span>
<span class="sd"> a b c</span>
<span class="sd"> 0 4 10 100</span>
<span class="sd"> 1 5 20 50</span>
<span class="sd"> 2 6 30 30</span>
<span class="sd"> 3 7 40 50</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="nf">abs</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;Series&quot;</span><span class="p">,</span> <span class="n">Column</span><span class="p">]:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="k">return</span> <span class="n">psser</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">return</span> <span class="n">psser</span><span class="o">.</span><span class="n">_with_new_scol</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">abs</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">),</span> <span class="n">field</span><span class="o">=</span><span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;bad operand type for abs(): </span><span class="si">{}</span><span class="s2"> (</span><span class="si">{}</span><span class="s2">)&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">spark_type_to_pandas_dtype</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">),</span>
<span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="o">.</span><span class="n">simpleString</span><span class="p">(),</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span><span class="nb">abs</span><span class="p">)</span>
<span class="c1"># TODO: by argument only support the grouping name and as_index only for now. Documentation</span>
<span class="c1"># should be updated when it&#39;s supported.</span>
<span class="k">def</span> <span class="nf">groupby</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span>
<span class="n">by</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]]],</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Axis</span> <span class="o">=</span> <span class="mi">0</span><span class="p">,</span>
<span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupBy[FrameLike]&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Group DataFrame or Series using one or more columns.</span>
<span class="sd"> A groupby operation involves some combination of splitting the</span>
<span class="sd"> object, applying a function, and combining the results. This can be</span>
<span class="sd"> used to group large amounts of data and compute operations on these</span>
<span class="sd"> groups.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> by: Series, label, or list of labels</span>
<span class="sd"> Used to determine the groups for the groupby.</span>
<span class="sd"> If Series is passed, the Series or dict VALUES</span>
<span class="sd"> will be used to determine the groups. A label or list of</span>
<span class="sd"> labels may be passed to group by the columns in ``self``.</span>
<span class="sd"> axis: int, default 0 or &#39;index&#39;</span>
<span class="sd"> Can only be set to 0 now.</span>
<span class="sd"> as_index: bool, default True</span>
<span class="sd"> For aggregated output, return object with group labels as the</span>
<span class="sd"> index. Only relevant for DataFrame input. as_index=False is</span>
<span class="sd"> effectively &quot;SQL-style&quot; grouped output.</span>
<span class="sd"> dropna: bool, default True</span>
<span class="sd"> If True, and if group keys contain NA values,</span>
<span class="sd"> NA values together with row/column will be dropped.</span>
<span class="sd"> If False, NA values will also be treated as the key in groups.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrameGroupBy or SeriesGroupBy</span>
<span class="sd"> Depends on the calling object and returns groupby object that</span>
<span class="sd"> contains information about the groups.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.groupby.GroupBy</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;Animal&#39;: [&#39;Falcon&#39;, &#39;Falcon&#39;,</span>
<span class="sd"> ... &#39;Parrot&#39;, &#39;Parrot&#39;],</span>
<span class="sd"> ... &#39;Max Speed&#39;: [380., 370., 24., 26.]},</span>
<span class="sd"> ... columns=[&#39;Animal&#39;, &#39;Max Speed&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> Animal Max Speed</span>
<span class="sd"> 0 Falcon 380.0</span>
<span class="sd"> 1 Falcon 370.0</span>
<span class="sd"> 2 Parrot 24.0</span>
<span class="sd"> 3 Parrot 26.0</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;Animal&#39;]).mean().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> Max Speed</span>
<span class="sd"> Animal</span>
<span class="sd"> Falcon 375.0</span>
<span class="sd"> Parrot 25.0</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;Animal&#39;], as_index=False).mean().sort_values(&#39;Animal&#39;)</span>
<span class="sd"> ... # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE</span>
<span class="sd"> Animal Max Speed</span>
<span class="sd"> ...Falcon 375.0</span>
<span class="sd"> ...Parrot 25.0</span>
<span class="sd"> We can also choose to include NA in group keys or not by setting dropna parameter,</span>
<span class="sd"> the default setting is True:</span>
<span class="sd"> &gt;&gt;&gt; l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(l, columns=[&quot;a&quot;, &quot;b&quot;, &quot;c&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(by=[&quot;b&quot;]).sum().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> a c</span>
<span class="sd"> b</span>
<span class="sd"> 1.0 2 3</span>
<span class="sd"> 2.0 2 5</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(by=[&quot;b&quot;], dropna=False).sum().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> a c</span>
<span class="sd"> b</span>
<span class="sd"> 1.0 2 3</span>
<span class="sd"> 2.0 2 5</span>
<span class="sd"> NaN 1 4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">new_by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Label</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">]]</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Grouper for &#39;</span><span class="si">{}</span><span class="s2">&#39; not 1-dimensional&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">by</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="n">new_by</span> <span class="o">=</span> <span class="p">[</span><span class="n">by</span><span class="p">]</span>
<span class="k">elif</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">by</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">by</span><span class="p">)</span>
<span class="n">new_by</span> <span class="o">=</span> <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">by</span><span class="p">)]</span>
<span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">by</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">by</span><span class="p">)</span>
<span class="n">new_by</span> <span class="o">=</span> <span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="p">(</span><span class="n">by</span><span class="p">,))]</span>
<span class="k">elif</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">by</span><span class="p">):</span>
<span class="n">new_by</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">by</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Grouper for &#39;</span><span class="si">{}</span><span class="s2">&#39; not 1-dimensional&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">key</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="n">new_by</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">key</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">key</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">key</span><span class="p">)</span>
<span class="n">new_by</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="n">key</span><span class="p">))</span>
<span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">key</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">key</span><span class="p">)</span>
<span class="n">new_by</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">Label</span><span class="p">,</span> <span class="p">(</span><span class="n">key</span><span class="p">,)))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;Grouper for &#39;</span><span class="si">{}</span><span class="s2">&#39; not 1-dimensional&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">key</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Grouper for &#39;</span><span class="si">{}</span><span class="s2">&#39; not 1-dimensional&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">by</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">len</span><span class="p">(</span><span class="n">new_by</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;No group keys passed!&quot;</span><span class="p">)</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="n">axis</span> <span class="o">!=</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s1">&#39;axis should be either 0 or &quot;index&quot; currently.&#39;</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_build_groupby</span><span class="p">(</span><span class="n">by</span><span class="o">=</span><span class="n">new_by</span><span class="p">,</span> <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">)</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_build_groupby</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="s2">&quot;Series&quot;</span><span class="p">,</span> <span class="n">Label</span><span class="p">]],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupBy[FrameLike]&quot;</span><span class="p">:</span>
<span class="k">pass</span>
<span class="k">def</span> <span class="nf">bool</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the bool of a single element in the current object.</span>
<span class="sd"> This must be a boolean scalar value, either True or False. Raise a ValueError if</span>
<span class="sd"> the object does not have exactly 1 element, or that element is not boolean</span>
<span class="sd"> .. deprecated:: 4.0.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> bool</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({&#39;a&#39;: [True]}).bool()</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([False]).bool()</span>
<span class="sd"> False</span>
<span class="sd"> If there are non-boolean or multiple values exist, it raises an exception in all</span>
<span class="sd"> cases as below.</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({&#39;a&#39;: [&#39;a&#39;]}).bool()</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> ValueError: bool cannot act on a non-boolean single element DataFrame</span>
<span class="sd"> &gt;&gt;&gt; ps.DataFrame({&#39;a&#39;: [True], &#39;b&#39;: [False]}).bool() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(),</span>
<span class="sd"> a.item(), a.any() or a.all().</span>
<span class="sd"> &gt;&gt;&gt; ps.Series([1]).bool()</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> ValueError: bool cannot act on a non-boolean single element DataFrame</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2">.bool is now deprecated &quot;</span>
<span class="s2">&quot;and will be removed in future version.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">df</span> <span class="o">=</span> <span class="bp">self</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="n">df</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_dataframe</span><span class="p">()</span>
<span class="k">return</span> <span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">bool</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">first_valid_index</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Retrieves the index of the first valid value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> scalar, tuple, or None</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Support for DataFrame</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({&#39;a&#39;: [None, 2, 3, 2],</span>
<span class="sd"> ... &#39;b&#39;: [None, 2.0, 3.0, 1.0],</span>
<span class="sd"> ... &#39;c&#39;: [None, 200, 400, 200]},</span>
<span class="sd"> ... index=[&#39;Q&#39;, &#39;W&#39;, &#39;E&#39;, &#39;R&#39;])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> a b c</span>
<span class="sd"> Q NaN NaN NaN</span>
<span class="sd"> W 2.0 2.0 200.0</span>
<span class="sd"> E 3.0 3.0 400.0</span>
<span class="sd"> R 2.0 1.0 200.0</span>
<span class="sd"> &gt;&gt;&gt; psdf.first_valid_index()</span>
<span class="sd"> &#39;W&#39;</span>
<span class="sd"> Support for MultiIndex columns</span>
<span class="sd"> &gt;&gt;&gt; psdf.columns = pd.MultiIndex.from_tuples([(&#39;a&#39;, &#39;x&#39;), (&#39;b&#39;, &#39;y&#39;), (&#39;c&#39;, &#39;z&#39;)])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> a b c</span>
<span class="sd"> x y z</span>
<span class="sd"> Q NaN NaN NaN</span>
<span class="sd"> W 2.0 2.0 200.0</span>
<span class="sd"> E 3.0 3.0 400.0</span>
<span class="sd"> R 2.0 1.0 200.0</span>
<span class="sd"> &gt;&gt;&gt; psdf.first_valid_index()</span>
<span class="sd"> &#39;W&#39;</span>
<span class="sd"> Support for Series.</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([None, None, 3, 4, 5], index=[100, 200, 300, 400, 500])</span>
<span class="sd"> &gt;&gt;&gt; s</span>
<span class="sd"> 100 NaN</span>
<span class="sd"> 200 NaN</span>
<span class="sd"> 300 3.0</span>
<span class="sd"> 400 4.0</span>
<span class="sd"> 500 5.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; s.first_valid_index()</span>
<span class="sd"> 300</span>
<span class="sd"> Support for MultiIndex</span>
<span class="sd"> &gt;&gt;&gt; midx = pd.MultiIndex([[&#39;lama&#39;, &#39;cow&#39;, &#39;falcon&#39;],</span>
<span class="sd"> ... [&#39;speed&#39;, &#39;weight&#39;, &#39;length&#39;]],</span>
<span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span>
<span class="sd"> ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([None, None, None, None, 250, 1.5, 320, 1, 0.3], index=midx)</span>
<span class="sd"> &gt;&gt;&gt; s</span>
<span class="sd"> lama speed NaN</span>
<span class="sd"> weight NaN</span>
<span class="sd"> length NaN</span>
<span class="sd"> cow speed NaN</span>
<span class="sd"> weight 250.0</span>
<span class="sd"> length 1.5</span>
<span class="sd"> falcon speed 320.0</span>
<span class="sd"> weight 1.0</span>
<span class="sd"> length 0.3</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; s.first_valid_index()</span>
<span class="sd"> (&#39;cow&#39;, &#39;weight&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">data_spark_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">data_spark_columns</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="n">cond</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">&amp;</span> <span class="n">y</span><span class="p">,</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">(),</span> <span class="n">data_spark_columns</span><span class="p">))</span>
<span class="k">with</span> <span class="n">sql_conf</span><span class="p">({</span><span class="n">SPARK_CONF_ARROW_ENABLED</span><span class="p">:</span> <span class="kc">False</span><span class="p">}):</span>
<span class="c1"># Disable Arrow to keep row ordering.</span>
<span class="n">first_valid_row</span> <span class="o">=</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span>
<span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span>
<span class="o">.</span><span class="n">limit</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="o">.</span><span class="n">toPandas</span><span class="p">()</span>
<span class="p">)</span>
<span class="c1"># For Empty Series or DataFrame, returns None.</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">first_valid_row</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="n">first_valid_row</span> <span class="o">=</span> <span class="n">first_valid_row</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">first_valid_row</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">return</span> <span class="n">first_valid_row</span><span class="o">.</span><span class="n">iloc</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">first_valid_row</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">last_valid_index</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return index for last non-NA/null value.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> scalar, tuple, or None</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This API only works with PySpark &gt;= 3.0.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Support for DataFrame</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({&#39;a&#39;: [1, 2, 3, None],</span>
<span class="sd"> ... &#39;b&#39;: [1.0, 2.0, 3.0, None],</span>
<span class="sd"> ... &#39;c&#39;: [100, 200, 400, None]},</span>
<span class="sd"> ... index=[&#39;Q&#39;, &#39;W&#39;, &#39;E&#39;, &#39;R&#39;])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> a b c</span>
<span class="sd"> Q 1.0 1.0 100.0</span>
<span class="sd"> W 2.0 2.0 200.0</span>
<span class="sd"> E 3.0 3.0 400.0</span>
<span class="sd"> R NaN NaN NaN</span>
<span class="sd"> &gt;&gt;&gt; psdf.last_valid_index() # doctest: +SKIP</span>
<span class="sd"> &#39;E&#39;</span>
<span class="sd"> Support for MultiIndex columns</span>
<span class="sd"> &gt;&gt;&gt; psdf.columns = pd.MultiIndex.from_tuples([(&#39;a&#39;, &#39;x&#39;), (&#39;b&#39;, &#39;y&#39;), (&#39;c&#39;, &#39;z&#39;)])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> a b c</span>
<span class="sd"> x y z</span>
<span class="sd"> Q 1.0 1.0 100.0</span>
<span class="sd"> W 2.0 2.0 200.0</span>
<span class="sd"> E 3.0 3.0 400.0</span>
<span class="sd"> R NaN NaN NaN</span>
<span class="sd"> &gt;&gt;&gt; psdf.last_valid_index() # doctest: +SKIP</span>
<span class="sd"> &#39;E&#39;</span>
<span class="sd"> Support for Series.</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([1, 2, 3, None, None], index=[100, 200, 300, 400, 500])</span>
<span class="sd"> &gt;&gt;&gt; s</span>
<span class="sd"> 100 1.0</span>
<span class="sd"> 200 2.0</span>
<span class="sd"> 300 3.0</span>
<span class="sd"> 400 NaN</span>
<span class="sd"> 500 NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; s.last_valid_index() # doctest: +SKIP</span>
<span class="sd"> 300</span>
<span class="sd"> Support for MultiIndex</span>
<span class="sd"> &gt;&gt;&gt; midx = pd.MultiIndex([[&#39;lama&#39;, &#39;cow&#39;, &#39;falcon&#39;],</span>
<span class="sd"> ... [&#39;speed&#39;, &#39;weight&#39;, &#39;length&#39;]],</span>
<span class="sd"> ... [[0, 0, 0, 1, 1, 1, 2, 2, 2],</span>
<span class="sd"> ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([250, 1.5, 320, 1, 0.3, None, None, None, None], index=midx)</span>
<span class="sd"> &gt;&gt;&gt; s</span>
<span class="sd"> lama speed 250.0</span>
<span class="sd"> weight 1.5</span>
<span class="sd"> length 320.0</span>
<span class="sd"> cow speed 1.0</span>
<span class="sd"> weight 0.3</span>
<span class="sd"> length NaN</span>
<span class="sd"> falcon speed NaN</span>
<span class="sd"> weight NaN</span>
<span class="sd"> length NaN</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; s.last_valid_index() # doctest: +SKIP</span>
<span class="sd"> (&#39;cow&#39;, &#39;weight&#39;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">data_spark_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">data_spark_columns</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="n">cond</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">x</span> <span class="o">&amp;</span> <span class="n">y</span><span class="p">,</span> <span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">isNotNull</span><span class="p">(),</span> <span class="n">data_spark_columns</span><span class="p">))</span>
<span class="n">last_valid_rows</span> <span class="o">=</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span>
<span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span>
<span class="o">.</span><span class="n">tail</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span>
<span class="p">)</span>
<span class="c1"># For Empty Series or DataFrame, returns None.</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">last_valid_rows</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="n">last_valid_row</span> <span class="o">=</span> <span class="n">last_valid_rows</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">last_valid_row</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">return</span> <span class="n">last_valid_row</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">tuple</span><span class="p">(</span><span class="n">last_valid_row</span><span class="p">)</span>
<span class="c1"># TODO: &#39;center&#39;, &#39;win_type&#39;, &#39;on&#39;, &#39;axis&#39; parameter should be implemented.</span>
<span class="k">def</span> <span class="nf">rolling</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">window</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Rolling[FrameLike]&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Provide rolling transformations.</span>
<span class="sd"> .. note:: &#39;min_periods&#39; in pandas-on-Spark works as a fixed window size unlike pandas.</span>
<span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span>
<span class="sd"> soon.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> window: int, or offset</span>
<span class="sd"> Size of the moving window.</span>
<span class="sd"> This is the number of observations used for calculating the statistic.</span>
<span class="sd"> Each window will be a fixed size.</span>
<span class="sd"> min_periods: int, default None</span>
<span class="sd"> Minimum number of observations in window required to have a value</span>
<span class="sd"> (otherwise result is NA).</span>
<span class="sd"> For a window that is specified by an offset, min_periods will default to 1.</span>
<span class="sd"> Otherwise, min_periods will default to the size of the window.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> a Window sub-classed for the operation</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">Rolling</span>
<span class="k">return</span> <span class="n">Rolling</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">window</span><span class="o">=</span><span class="n">window</span><span class="p">,</span> <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">)</span>
<span class="c1"># TODO: &#39;center&#39; and &#39;axis&#39; parameter should be implemented.</span>
<span class="c1"># &#39;axis&#39; implementation, refer https://github.com/databricks/koalas/pull/607</span>
<span class="k">def</span> <span class="nf">expanding</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Expanding[FrameLike]&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Provide expanding transformations.</span>
<span class="sd"> .. note:: &#39;min_periods&#39; in pandas-on-Spark works as a fixed window size unlike pandas.</span>
<span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span>
<span class="sd"> soon.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> min_periods: int, default 1</span>
<span class="sd"> Minimum number of observations in window required to have a value</span>
<span class="sd"> (otherwise result is NA).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> a Window sub-classed for the operation</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">Expanding</span>
<span class="k">return</span> <span class="n">Expanding</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">)</span>
<span class="c1"># TODO: &#39;adjust&#39;, &#39;axis&#39;, &#39;method&#39; parameter should be implemented.</span>
<span class="k">def</span> <span class="nf">ewm</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span>
<span class="n">com</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">span</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">halflife</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">alpha</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">min_periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">ignore_na</span><span class="p">:</span> <span class="n">bool_type</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;ExponentialMoving[FrameLike]&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Provide exponentially weighted window transformations.</span>
<span class="sd"> .. note:: &#39;min_periods&#39; in pandas-on-Spark works as a fixed window size unlike pandas.</span>
<span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span>
<span class="sd"> soon.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> com: float, optional</span>
<span class="sd"> Specify decay in terms of center of mass.</span>
<span class="sd"> alpha = 1 / (1 + com), for com &gt;= 0.</span>
<span class="sd"> span: float, optional</span>
<span class="sd"> Specify decay in terms of span.</span>
<span class="sd"> alpha = 2 / (span + 1), for span &gt;= 1.</span>
<span class="sd"> halflife: float, optional</span>
<span class="sd"> Specify decay in terms of half-life.</span>
<span class="sd"> alpha = 1 - exp(-ln(2) / halflife), for halflife &gt; 0.</span>
<span class="sd"> alpha: float, optional</span>
<span class="sd"> Specify smoothing factor alpha directly.</span>
<span class="sd"> 0 &lt; alpha &lt;= 1.</span>
<span class="sd"> min_periods: int, default None</span>
<span class="sd"> Minimum number of observations in window required to have a value</span>
<span class="sd"> (otherwise result is NA).</span>
<span class="sd"> ignore_na: bool, default False</span>
<span class="sd"> Ignore missing values when calculating weights.</span>
<span class="sd"> - When ``ignore_na=False`` (default), weights are based on absolute positions.</span>
<span class="sd"> For example, the weights of :math:`x_0` and :math:`x_2` used in calculating</span>
<span class="sd"> the final weighted average of [:math:`x_0`, None, :math:`x_2`] are</span>
<span class="sd"> :math:`(1-\alpha)^2` and :math:`1` if ``adjust=True``, and</span>
<span class="sd"> :math:`(1-\alpha)^2` and :math:`\alpha` if ``adjust=False``.</span>
<span class="sd"> - When ``ignore_na=True``, weights are based</span>
<span class="sd"> on relative positions. For example, the weights of :math:`x_0` and :math:`x_2`</span>
<span class="sd"> used in calculating the final weighted average of</span>
<span class="sd"> [:math:`x_0`, None, :math:`x_2`] are :math:`1-\alpha` and :math:`1` if</span>
<span class="sd"> ``adjust=True``, and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> a Window sub-classed for the operation</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">ExponentialMoving</span>
<span class="k">return</span> <span class="n">ExponentialMoving</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">com</span><span class="o">=</span><span class="n">com</span><span class="p">,</span>
<span class="n">span</span><span class="o">=</span><span class="n">span</span><span class="p">,</span>
<span class="n">halflife</span><span class="o">=</span><span class="n">halflife</span><span class="p">,</span>
<span class="n">alpha</span><span class="o">=</span><span class="n">alpha</span><span class="p">,</span>
<span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">,</span>
<span class="n">ignore_na</span><span class="o">=</span><span class="n">ignore_na</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">get</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">default</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Get item from object for given key (DataFrame column, Panel slice,</span>
<span class="sd"> etc.). Returns default value if not found.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> key: object</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> value: same type as items contained in object</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;x&#39;:range(3), &#39;y&#39;:[&#39;a&#39;,&#39;b&#39;,&#39;b&#39;], &#39;z&#39;:[&#39;a&#39;,&#39;b&#39;,&#39;b&#39;]},</span>
<span class="sd"> ... columns=[&#39;x&#39;, &#39;y&#39;, &#39;z&#39;], index=[10, 20, 20])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> x y z</span>
<span class="sd"> 10 0 a a</span>
<span class="sd"> 20 1 b b</span>
<span class="sd"> 20 2 b b</span>
<span class="sd"> &gt;&gt;&gt; df.get(&#39;x&#39;)</span>
<span class="sd"> 10 0</span>
<span class="sd"> 20 1</span>
<span class="sd"> 20 2</span>
<span class="sd"> Name: x, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.get([&#39;x&#39;, &#39;y&#39;])</span>
<span class="sd"> x y</span>
<span class="sd"> 10 0 a</span>
<span class="sd"> 20 1 b</span>
<span class="sd"> 20 2 b</span>
<span class="sd"> &gt;&gt;&gt; df.x.get(10)</span>
<span class="sd"> 0</span>
<span class="sd"> &gt;&gt;&gt; df.x.get(20)</span>
<span class="sd"> 20 1</span>
<span class="sd"> 20 2</span>
<span class="sd"> Name: x, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.x.get(15, -1)</span>
<span class="sd"> -1</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="p">[</span><span class="n">key</span><span class="p">]</span>
<span class="k">except</span> <span class="p">(</span><span class="ne">KeyError</span><span class="p">,</span> <span class="ne">ValueError</span><span class="p">,</span> <span class="ne">IndexError</span><span class="p">):</span>
<span class="k">return</span> <span class="n">default</span>
<span class="k">def</span> <span class="nf">squeeze</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">,</span> <span class="s2">&quot;Series&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Squeeze 1 dimensional axis objects into scalars.</span>
<span class="sd"> Series or DataFrames with a single element are squeezed to a scalar.</span>
<span class="sd"> DataFrames with a single column or a single row are squeezed to a</span>
<span class="sd"> Series. Otherwise the object is unchanged.</span>
<span class="sd"> This method is most useful when you don&#39;t know if your</span>
<span class="sd"> object is a Series or DataFrame, but you do know it has just a single</span>
<span class="sd"> column. In that case you can safely call `squeeze` to ensure you have a</span>
<span class="sd"> Series.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {0 or &#39;index&#39;, 1 or &#39;columns&#39;, None}, default None</span>
<span class="sd"> A specific axis to squeeze. By default, all length-1 axes are</span>
<span class="sd"> squeezed.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame, Series, or scalar</span>
<span class="sd"> The projection after squeezing `axis` or all the axes.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.iloc: Integer-location based indexing for selecting scalars.</span>
<span class="sd"> DataFrame.iloc: Integer-location based indexing for selecting Series.</span>
<span class="sd"> Series.to_frame: Inverse of DataFrame.squeeze for a</span>
<span class="sd"> single-column DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; primes = ps.Series([2, 3, 5, 7])</span>
<span class="sd"> Slicing might produce a Series with a single value:</span>
<span class="sd"> &gt;&gt;&gt; even_primes = primes[primes % 2 == 0]</span>
<span class="sd"> &gt;&gt;&gt; even_primes</span>
<span class="sd"> 0 2</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; even_primes.squeeze()</span>
<span class="sd"> 2</span>
<span class="sd"> Squeezing objects with more than one value in every axis does nothing:</span>
<span class="sd"> &gt;&gt;&gt; odd_primes = primes[primes % 2 == 1]</span>
<span class="sd"> &gt;&gt;&gt; odd_primes</span>
<span class="sd"> 1 3</span>
<span class="sd"> 2 5</span>
<span class="sd"> 3 7</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; odd_primes.squeeze()</span>
<span class="sd"> 1 3</span>
<span class="sd"> 2 5</span>
<span class="sd"> 3 7</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> Squeezing is even more effective when used with DataFrames.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[1, 2], [3, 4]], columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b</span>
<span class="sd"> 0 1 2</span>
<span class="sd"> 1 3 4</span>
<span class="sd"> Slicing a single column will produce a DataFrame with the columns</span>
<span class="sd"> having only one value:</span>
<span class="sd"> &gt;&gt;&gt; df_a = df[[&#39;a&#39;]]</span>
<span class="sd"> &gt;&gt;&gt; df_a</span>
<span class="sd"> a</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 3</span>
<span class="sd"> The columns can be squeezed down, resulting in a Series:</span>
<span class="sd"> &gt;&gt;&gt; df_a.squeeze(&#39;columns&#39;)</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 3</span>
<span class="sd"> Name: a, dtype: int64</span>
<span class="sd"> Slicing a single row from a single column will produce a single</span>
<span class="sd"> scalar DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; df_1a = df.loc[[1], [&#39;a&#39;]]</span>
<span class="sd"> &gt;&gt;&gt; df_1a</span>
<span class="sd"> a</span>
<span class="sd"> 1 3</span>
<span class="sd"> Squeezing the rows produces a single scalar Series:</span>
<span class="sd"> &gt;&gt;&gt; df_1a.squeeze(&#39;rows&#39;)</span>
<span class="sd"> a 3</span>
<span class="sd"> Name: 1, dtype: int64</span>
<span class="sd"> Squeezing all axes will project directly into a scalar:</span>
<span class="sd"> &gt;&gt;&gt; df_1a.squeeze()</span>
<span class="sd"> 3</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">axis</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">axis</span> <span class="o">=</span> <span class="s2">&quot;index&quot;</span> <span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="s2">&quot;rows&quot;</span> <span class="k">else</span> <span class="n">axis</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span>
<span class="n">is_squeezable</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">columns</span><span class="p">[:</span><span class="mi">2</span><span class="p">])</span> <span class="o">==</span> <span class="mi">1</span>
<span class="c1"># If DataFrame has multiple columns, there is no change.</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_squeezable</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="n">series_from_column</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="n">has_single_value</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">series_from_column</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">2</span><span class="p">))</span> <span class="o">==</span> <span class="mi">1</span>
<span class="c1"># If DataFrame has only a single value, use pandas API directly.</span>
<span class="k">if</span> <span class="n">has_single_value</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span><span class="o">.</span><span class="n">squeeze</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="k">return</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">(</span><span class="n">result</span><span class="p">)</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">result</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="k">else</span> <span class="n">result</span>
<span class="k">elif</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">series_from_column</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># The case of Series is simple.</span>
<span class="c1"># If Series has only a single value, just return it as a scalar.</span>
<span class="c1"># Otherwise, there is no change.</span>
<span class="n">self_top_two</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="s2">&quot;Series&quot;</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
<span class="n">has_single_value</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">self_top_two</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">Scalar</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> <span class="n">self_top_two</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">if</span> <span class="n">has_single_value</span> <span class="k">else</span> <span class="bp">self</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">truncate</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">before</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">after</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">copy</span><span class="p">:</span> <span class="n">bool_type</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrameOrSeries</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Truncate a Series or DataFrame before and after some index value.</span>
<span class="sd"> This is a useful shorthand for boolean indexing based on index</span>
<span class="sd"> values above or below certain thresholds.</span>
<span class="sd"> .. note:: This API is dependent on :meth:`Index.is_monotonic_increasing`</span>
<span class="sd"> which can be expensive.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> before: date, str, int</span>
<span class="sd"> Truncate all rows before this index value.</span>
<span class="sd"> after: date, str, int</span>
<span class="sd"> Truncate all rows after this index value.</span>
<span class="sd"> axis: {0 or &#39;index&#39;, 1 or &#39;columns&#39;}, optional</span>
<span class="sd"> Axis to truncate. Truncates the index (rows) by default.</span>
<span class="sd"> copy: bool, default is True,</span>
<span class="sd"> Return a copy of the truncated section.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> type of caller</span>
<span class="sd"> The truncated Series or DataFrame.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.loc: Select a subset of a DataFrame by label.</span>
<span class="sd"> DataFrame.iloc: Select a subset of a DataFrame by position.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;],</span>
<span class="sd"> ... &#39;B&#39;: [&#39;f&#39;, &#39;g&#39;, &#39;h&#39;, &#39;i&#39;, &#39;j&#39;],</span>
<span class="sd"> ... &#39;C&#39;: [&#39;k&#39;, &#39;l&#39;, &#39;m&#39;, &#39;n&#39;, &#39;o&#39;]},</span>
<span class="sd"> ... index=[1, 2, 3, 4, 5])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C</span>
<span class="sd"> 1 a f k</span>
<span class="sd"> 2 b g l</span>
<span class="sd"> 3 c h m</span>
<span class="sd"> 4 d i n</span>
<span class="sd"> 5 e j o</span>
<span class="sd"> &gt;&gt;&gt; df.truncate(before=2, after=4)</span>
<span class="sd"> A B C</span>
<span class="sd"> 2 b g l</span>
<span class="sd"> 3 c h m</span>
<span class="sd"> 4 d i n</span>
<span class="sd"> The columns of a DataFrame can be truncated.</span>
<span class="sd"> &gt;&gt;&gt; df.truncate(before=&quot;A&quot;, after=&quot;B&quot;, axis=&quot;columns&quot;)</span>
<span class="sd"> A B</span>
<span class="sd"> 1 a f</span>
<span class="sd"> 2 b g</span>
<span class="sd"> 3 c h</span>
<span class="sd"> 4 d i</span>
<span class="sd"> 5 e j</span>
<span class="sd"> For Series, only rows can be truncated.</span>
<span class="sd"> &gt;&gt;&gt; df[&#39;A&#39;].truncate(before=2, after=4)</span>
<span class="sd"> 2 b</span>
<span class="sd"> 3 c</span>
<span class="sd"> 4 d</span>
<span class="sd"> Name: A, dtype: object</span>
<span class="sd"> A Series has index that sorted integers.</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([10, 20, 30, 40, 50, 60, 70],</span>
<span class="sd"> ... index=[1, 2, 3, 4, 5, 6, 7])</span>
<span class="sd"> &gt;&gt;&gt; s</span>
<span class="sd"> 1 10</span>
<span class="sd"> 2 20</span>
<span class="sd"> 3 30</span>
<span class="sd"> 4 40</span>
<span class="sd"> 5 50</span>
<span class="sd"> 6 60</span>
<span class="sd"> 7 70</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; s.truncate(2, 5)</span>
<span class="sd"> 2 20</span>
<span class="sd"> 3 30</span>
<span class="sd"> 4 40</span>
<span class="sd"> 5 50</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> A Series has index that sorted strings.</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([10, 20, 30, 40, 50, 60, 70],</span>
<span class="sd"> ... index=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;, &#39;d&#39;, &#39;e&#39;, &#39;f&#39;, &#39;g&#39;])</span>
<span class="sd"> &gt;&gt;&gt; s</span>
<span class="sd"> a 10</span>
<span class="sd"> b 20</span>
<span class="sd"> c 30</span>
<span class="sd"> d 40</span>
<span class="sd"> e 50</span>
<span class="sd"> f 60</span>
<span class="sd"> g 70</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; s.truncate(&#39;b&#39;, &#39;e&#39;)</span>
<span class="sd"> b 20</span>
<span class="sd"> c 30</span>
<span class="sd"> d 40</span>
<span class="sd"> e 50</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">first_series</span>
<span class="n">axis</span> <span class="o">=</span> <span class="n">validate_axis</span><span class="p">(</span><span class="n">axis</span><span class="p">)</span>
<span class="n">indexes</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">index</span>
<span class="n">indexes_increasing</span> <span class="o">=</span> <span class="n">indexes</span><span class="o">.</span><span class="n">is_monotonic_increasing</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">indexes_increasing</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">indexes</span><span class="o">.</span><span class="n">is_monotonic_decreasing</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;truncate requires a sorted index&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="p">(</span><span class="n">before</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">)</span> <span class="ow">and</span> <span class="p">(</span><span class="n">after</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">):</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> <span class="bp">self</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> <span class="k">if</span> <span class="n">copy</span> <span class="k">else</span> <span class="bp">self</span><span class="p">)</span>
<span class="k">if</span> <span class="p">(</span><span class="n">before</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">after</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">)</span> <span class="ow">and</span> <span class="n">before</span> <span class="o">&gt;</span> <span class="n">after</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;Truncate: </span><span class="si">%s</span><span class="s2"> must be after </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">after</span><span class="p">,</span> <span class="n">before</span><span class="p">))</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">):</span>
<span class="k">if</span> <span class="n">indexes_increasing</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">before</span><span class="p">:</span><span class="n">after</span><span class="p">]</span> <span class="c1"># type: ignore[arg-type]</span>
<span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">to_frame</span><span class="p">()</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">after</span><span class="p">:</span><span class="n">before</span><span class="p">]</span> <span class="c1"># type: ignore[arg-type]</span>
<span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="k">if</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">if</span> <span class="n">indexes_increasing</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">before</span><span class="p">:</span><span class="n">after</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="n">after</span><span class="p">:</span><span class="n">before</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span>
<span class="k">elif</span> <span class="n">axis</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">result</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">loc</span><span class="p">[:,</span> <span class="n">before</span><span class="p">:</span><span class="n">after</span><span class="p">]</span> <span class="c1"># type: ignore[assignment]</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrameOrSeries</span><span class="p">,</span> <span class="n">result</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> <span class="k">if</span> <span class="n">copy</span> <span class="k">else</span> <span class="n">result</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">to_markdown</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">buf</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">IO</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">mode</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Print Series or DataFrame in Markdown-friendly format.</span>
<span class="sd"> .. note:: This method should only be used if the resulting pandas object is expected</span>
<span class="sd"> to be small, as all the data is loaded into the driver&#39;s memory.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> buf: writable buffer, defaults to sys.stdout</span>
<span class="sd"> Where to send the output. By default, the output is printed to</span>
<span class="sd"> sys.stdout. Pass a writable buffer if you need to further process</span>
<span class="sd"> the output.</span>
<span class="sd"> mode: str, optional</span>
<span class="sd"> Mode in which file is opened.</span>
<span class="sd"> **kwargs</span>
<span class="sd"> These parameters will be passed to `tabulate`.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> str</span>
<span class="sd"> Series or DataFrame in Markdown-friendly format.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Requires the `tabulate &lt;https://pypi.org/project/tabulate&gt;`_ package.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psser = ps.Series([&quot;elk&quot;, &quot;pig&quot;, &quot;dog&quot;, &quot;quetzal&quot;], name=&quot;animal&quot;)</span>
<span class="sd"> &gt;&gt;&gt; print(psser.to_markdown()) # doctest: +SKIP</span>
<span class="sd"> | | animal |</span>
<span class="sd"> |---:|:---------|</span>
<span class="sd"> | 0 | elk |</span>
<span class="sd"> | 1 | pig |</span>
<span class="sd"> | 2 | dog |</span>
<span class="sd"> | 3 | quetzal |</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame(</span>
<span class="sd"> ... data={&quot;animal_1&quot;: [&quot;elk&quot;, &quot;pig&quot;], &quot;animal_2&quot;: [&quot;dog&quot;, &quot;quetzal&quot;]}</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; print(psdf.to_markdown()) # doctest: +SKIP</span>
<span class="sd"> | | animal_1 | animal_2 |</span>
<span class="sd"> |---:|:-----------|:-----------|</span>
<span class="sd"> | 0 | elk | dog |</span>
<span class="sd"> | 1 | pig | quetzal |</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;`to_markdown` loads all data into the driver&#39;s memory. &quot;</span>
<span class="s2">&quot;It should only be used if the resulting pandas object is expected to be small.&quot;</span>
<span class="p">)</span>
<span class="c1"># Make sure locals() call is at the top of the function so we don&#39;t capture local variables.</span>
<span class="n">args</span> <span class="o">=</span> <span class="nb">locals</span><span class="p">()</span>
<span class="n">psser_or_psdf</span> <span class="o">=</span> <span class="bp">self</span>
<span class="n">internal_pandas</span> <span class="o">=</span> <span class="n">psser_or_psdf</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span>
<span class="k">return</span> <span class="n">validate_arguments_and_invoke_function</span><span class="p">(</span>
<span class="n">internal_pandas</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">to_markdown</span><span class="p">,</span> <span class="nb">type</span><span class="p">(</span><span class="n">internal_pandas</span><span class="p">)</span><span class="o">.</span><span class="n">to_markdown</span><span class="p">,</span> <span class="n">args</span>
<span class="p">)</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span>
<span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">method</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inplace</span><span class="p">:</span> <span class="n">bool_type</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="k">pass</span>
<span class="c1"># TODO: add &#39;downcast&#39; when value parameter exists</span>
<span class="k">def</span> <span class="nf">bfill</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inplace</span><span class="p">:</span> <span class="n">bool_type</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Synonym for `DataFrame.fillna()` or `Series.fillna()` with ``method=`bfill```.</span>
<span class="sd"> .. note:: the current implementation of &#39;bfill&#39; uses Spark&#39;s Window</span>
<span class="sd"> without specifying partition specification. This leads to moveing all data into a</span>
<span class="sd"> single partition in a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method with very large datasets.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {0 or `index`}</span>
<span class="sd"> 1 and `columns` are not supported.</span>
<span class="sd"> inplace: boolean, default False</span>
<span class="sd"> Fill in place (do not create a new object)</span>
<span class="sd"> limit: int, default None</span>
<span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span>
<span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span>
<span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span>
<span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span>
<span class="sd"> Must be greater than 0 if not None</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> DataFrame or Series with NA entries filled.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({</span>
<span class="sd"> ... &#39;A&#39;: [None, 3, None, None],</span>
<span class="sd"> ... &#39;B&#39;: [2, 4, None, 3],</span>
<span class="sd"> ... &#39;C&#39;: [None, None, None, 1],</span>
<span class="sd"> ... &#39;D&#39;: [0, 1, 5, 4]</span>
<span class="sd"> ... },</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;, &#39;D&#39;])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 NaN 2.0 NaN 0</span>
<span class="sd"> 1 3.0 4.0 NaN 1</span>
<span class="sd"> 2 NaN NaN NaN 5</span>
<span class="sd"> 3 NaN 3.0 1.0 4</span>
<span class="sd"> Propagate non-null values backward.</span>
<span class="sd"> &gt;&gt;&gt; psdf.bfill()</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 3.0 2.0 1.0 0</span>
<span class="sd"> 1 3.0 4.0 1.0 1</span>
<span class="sd"> 2 NaN 3.0 1.0 5</span>
<span class="sd"> 3 NaN 3.0 1.0 4</span>
<span class="sd"> For Series</span>
<span class="sd"> &gt;&gt;&gt; psser = ps.Series([None, None, None, 1])</span>
<span class="sd"> &gt;&gt;&gt; psser</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 NaN</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> 3 1.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; psser.bfill()</span>
<span class="sd"> 0 1.0</span>
<span class="sd"> 1 1.0</span>
<span class="sd"> 2 1.0</span>
<span class="sd"> 3 1.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">&quot;bfill&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="n">inplace</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span>
<span class="n">backfill</span> <span class="o">=</span> <span class="n">bfill</span>
<span class="c1"># TODO: add &#39;downcast&#39; when value parameter exists</span>
<span class="k">def</span> <span class="nf">ffill</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inplace</span><span class="p">:</span> <span class="n">bool_type</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Synonym for `DataFrame.fillna()` or `Series.fillna()` with ``method=`ffill```.</span>
<span class="sd"> .. note:: the current implementation of &#39;ffill&#39; uses Spark&#39;s Window</span>
<span class="sd"> without specifying partition specification. This leads to moveing all data into a</span>
<span class="sd"> single a partition in a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method with very large datasets.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis: {0 or `index`}</span>
<span class="sd"> 1 and `columns` are not supported.</span>
<span class="sd"> inplace: boolean, default False</span>
<span class="sd"> Fill in place (do not create a new object)</span>
<span class="sd"> limit: int, default None</span>
<span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span>
<span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span>
<span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span>
<span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span>
<span class="sd"> Must be greater than 0 if not None</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> DataFrame or Series with NA entries filled.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({</span>
<span class="sd"> ... &#39;A&#39;: [None, 3, None, None],</span>
<span class="sd"> ... &#39;B&#39;: [2, 4, None, 3],</span>
<span class="sd"> ... &#39;C&#39;: [None, None, None, 1],</span>
<span class="sd"> ... &#39;D&#39;: [0, 1, 5, 4]</span>
<span class="sd"> ... },</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;, &#39;D&#39;])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 NaN 2.0 NaN 0</span>
<span class="sd"> 1 3.0 4.0 NaN 1</span>
<span class="sd"> 2 NaN NaN NaN 5</span>
<span class="sd"> 3 NaN 3.0 1.0 4</span>
<span class="sd"> Propagate non-null values forward.</span>
<span class="sd"> &gt;&gt;&gt; psdf.ffill()</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 NaN 2.0 NaN 0</span>
<span class="sd"> 1 3.0 4.0 NaN 1</span>
<span class="sd"> 2 3.0 4.0 NaN 5</span>
<span class="sd"> 3 3.0 3.0 1.0 4</span>
<span class="sd"> For Series</span>
<span class="sd"> &gt;&gt;&gt; psser = ps.Series([2, 4, None, 3])</span>
<span class="sd"> &gt;&gt;&gt; psser</span>
<span class="sd"> 0 2.0</span>
<span class="sd"> 1 4.0</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> 3 3.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; psser.ffill()</span>
<span class="sd"> 0 2.0</span>
<span class="sd"> 1 4.0</span>
<span class="sd"> 2 4.0</span>
<span class="sd"> 3 3.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">&quot;ffill&quot;</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">inplace</span><span class="o">=</span><span class="n">inplace</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span>
<span class="n">pad</span> <span class="o">=</span> <span class="n">ffill</span>
<span class="c1"># TODO: add &#39;axis&#39;, &#39;inplace&#39;, &#39;downcast&#39;</span>
<span class="k">def</span> <span class="nf">interpolate</span><span class="p">(</span>
<span class="bp">self</span><span class="p">:</span> <span class="n">FrameLike</span><span class="p">,</span>
<span class="n">method</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;linear&quot;</span><span class="p">,</span>
<span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">limit_direction</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">limit_area</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Fill NaN values using an interpolation method.</span>
<span class="sd"> .. note:: the current implementation of interpolate uses Spark&#39;s Window without</span>
<span class="sd"> specifying partition specification. This leads to moveing all data into a</span>
<span class="sd"> single partition in a single machine and could cause serious</span>
<span class="sd"> performance degradation. Avoid this method with very large datasets.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> method: str, default &#39;linear&#39;</span>
<span class="sd"> Interpolation technique to use. One of:</span>
<span class="sd"> * &#39;linear&#39;: Ignore the index and treat the values as equally</span>
<span class="sd"> spaced.</span>
<span class="sd"> limit: int, optional</span>
<span class="sd"> Maximum number of consecutive NaNs to fill. Must be greater than</span>
<span class="sd"> 0.</span>
<span class="sd"> limit_direction: str, default None</span>
<span class="sd"> Consecutive NaNs will be filled in this direction.</span>
<span class="sd"> One of {{&#39;forward&#39;, &#39;backward&#39;, &#39;both&#39;}}.</span>
<span class="sd"> limit_area: str, default None</span>
<span class="sd"> If limit is specified, consecutive NaNs will be filled with this restriction. One of:</span>
<span class="sd"> * None: No fill restriction.</span>
<span class="sd"> * &#39;inside&#39;: Only fill NaNs surrounded by valid values (interpolate).</span>
<span class="sd"> * &#39;outside&#39;: Only fill NaNs outside valid values (extrapolate).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame or None</span>
<span class="sd"> Returns the same object type as the caller, interpolated at</span>
<span class="sd"> some or all NA values.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> fillna: Fill missing values using different methods.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Filling in NA via linear interpolation.</span>
<span class="sd"> &gt;&gt;&gt; s = ps.Series([0, 1, np.nan, 3])</span>
<span class="sd"> &gt;&gt;&gt; s</span>
<span class="sd"> 0 0.0</span>
<span class="sd"> 1 1.0</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> 3 3.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; s.interpolate()</span>
<span class="sd"> 0 0.0</span>
<span class="sd"> 1 1.0</span>
<span class="sd"> 2 2.0</span>
<span class="sd"> 3 3.0</span>
<span class="sd"> dtype: float64</span>
<span class="sd"> Fill the DataFrame forward (that is, going down) along each column</span>
<span class="sd"> using linear interpolation.</span>
<span class="sd"> Note how the last entry in column &#39;a&#39; is interpolated differently,</span>
<span class="sd"> because there is no entry after it to use for interpolation.</span>
<span class="sd"> Note how the first entry in column &#39;b&#39; remains NA, because there</span>
<span class="sd"> is no entry before it to use for interpolation.</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([(0.0, np.nan, -1.0, 1.0),</span>
<span class="sd"> ... (np.nan, 2.0, np.nan, np.nan),</span>
<span class="sd"> ... (2.0, 3.0, np.nan, 9.0),</span>
<span class="sd"> ... (np.nan, 4.0, -4.0, 16.0)],</span>
<span class="sd"> ... columns=list(&#39;abcd&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b c d</span>
<span class="sd"> 0 0.0 NaN -1.0 1.0</span>
<span class="sd"> 1 NaN 2.0 NaN NaN</span>
<span class="sd"> 2 2.0 3.0 NaN 9.0</span>
<span class="sd"> 3 NaN 4.0 -4.0 16.0</span>
<span class="sd"> &gt;&gt;&gt; df.interpolate(method=&#39;linear&#39;)</span>
<span class="sd"> a b c d</span>
<span class="sd"> 0 0.0 NaN -1.0 1.0</span>
<span class="sd"> 1 1.0 2.0 -2.0 5.0</span>
<span class="sd"> 2 2.0 3.0 -3.0 9.0</span>
<span class="sd"> 3 2.0 4.0 -4.0 16.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">interpolate</span><span class="p">(</span>
<span class="n">method</span><span class="o">=</span><span class="n">method</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">,</span> <span class="n">limit_direction</span><span class="o">=</span><span class="n">limit_direction</span><span class="p">,</span> <span class="n">limit_area</span><span class="o">=</span><span class="n">limit_area</span>
<span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">at</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">AtIndexer</span><span class="p">:</span>
<span class="k">return</span> <span class="n">AtIndexer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="n">at</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">AtIndexer</span><span class="o">.</span><span class="vm">__doc__</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">iat</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">iAtIndexer</span><span class="p">:</span>
<span class="k">return</span> <span class="n">iAtIndexer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="n">iat</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">iAtIndexer</span><span class="o">.</span><span class="vm">__doc__</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">iloc</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">iLocIndexer</span><span class="p">:</span>
<span class="k">return</span> <span class="n">iLocIndexer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="n">iloc</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">iLocIndexer</span><span class="o">.</span><span class="vm">__doc__</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">loc</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">LocIndexer</span><span class="p">:</span>
<span class="k">return</span> <span class="n">LocIndexer</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="n">loc</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">LocIndexer</span><span class="o">.</span><span class="vm">__doc__</span>
<span class="k">def</span> <span class="fm">__bool__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">NoReturn</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;The truth value of a </span><span class="si">{0}</span><span class="s2"> is ambiguous. &quot;</span>
<span class="s2">&quot;Use a.empty, a.bool(), a.item(), a.any() or a.all().&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="p">)</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_count_expr</span><span class="p">(</span><span class="n">psser</span><span class="p">:</span> <span class="s2">&quot;Series&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">nan_to_null</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">doctest</span>
<span class="kn">import</span> <span class="nn">shutil</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">tempfile</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span>
<span class="kn">import</span> <span class="nn">pyspark.pandas.generic</span>
<span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_HOME&quot;</span><span class="p">])</span>
<span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">generic</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;ps&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span>
<span class="n">spark</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[4]&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;pyspark.pandas.generic tests&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="p">)</span>
<span class="n">path</span> <span class="o">=</span> <span class="n">tempfile</span><span class="o">.</span><span class="n">mkdtemp</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;path&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">path</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">generic</span><span class="p">,</span>
<span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span>
<span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">shutil</span><span class="o">.</span><span class="n">rmtree</span><span class="p">(</span><span class="n">path</span><span class="p">,</span> <span class="n">ignore_errors</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="n">_test</span><span class="p">()</span>
</pre></div>
</article>
<footer class="bd-footer-article">
<div class="footer-article-items footer-article__inner">
<div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
</div></div>
</div>
</footer>
</div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script>
<script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item"><p class="copyright">
Copyright © 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
</p></div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item"><p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3.
</p></div>
</div>
</div>
</footer>
</body>
</html>