Source code for pyspark.pandas.groupby

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
A wrapper for GroupedData to behave like pandas GroupBy.
"""
<span class="kn">from</span> <span class="nn">abc</span> <span class="kn">import</span> <span class="n">ABCMeta</span><span class="p">,</span> <span class="n">abstractmethod</span>
<span class="kn">import</span> <span class="nn">inspect</span>
<span class="kn">from</span> <span class="nn">collections</span> <span class="kn">import</span> <span class="n">defaultdict</span><span class="p">,</span> <span class="n">namedtuple</span>
<span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">partial</span>
<span class="kn">from</span> <span class="nn">itertools</span> <span class="kn">import</span> <span class="n">product</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">Any</span><span class="p">,</span>
<span class="n">Callable</span><span class="p">,</span>
<span class="n">Dict</span><span class="p">,</span>
<span class="n">Generic</span><span class="p">,</span>
<span class="n">Iterator</span><span class="p">,</span>
<span class="n">Mapping</span><span class="p">,</span>
<span class="n">List</span><span class="p">,</span>
<span class="n">Optional</span><span class="p">,</span>
<span class="n">Sequence</span><span class="p">,</span>
<span class="n">Set</span><span class="p">,</span>
<span class="n">Tuple</span><span class="p">,</span>
<span class="n">Type</span><span class="p">,</span>
<span class="n">Union</span><span class="p">,</span>
<span class="n">cast</span><span class="p">,</span>
<span class="n">TYPE_CHECKING</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">import</span> <span class="nn">warnings</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="kn">from</span> <span class="nn">pandas.api.types</span> <span class="kn">import</span> <span class="n">is_number</span><span class="p">,</span> <span class="n">is_hashable</span><span class="p">,</span> <span class="n">is_list_like</span> <span class="c1"># type: ignore[attr-defined]</span>
<span class="kn">from</span> <span class="nn">pandas.core.common</span> <span class="kn">import</span> <span class="n">_builtin_table</span> <span class="c1"># type: ignore[attr-defined]</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Column</span><span class="p">,</span> <span class="n">DataFrame</span> <span class="k">as</span> <span class="n">SparkDataFrame</span><span class="p">,</span> <span class="n">Window</span><span class="p">,</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">BooleanType</span><span class="p">,</span>
<span class="n">DataType</span><span class="p">,</span>
<span class="n">DoubleType</span><span class="p">,</span>
<span class="n">NumericType</span><span class="p">,</span>
<span class="n">StructField</span><span class="p">,</span>
<span class="n">StructType</span><span class="p">,</span>
<span class="n">StringType</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">pandas</span> <span class="k">as</span> <span class="n">ps</span> <span class="c1"># For running doctests and reference resolution in PyCharm.</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas._typing</span> <span class="kn">import</span> <span class="n">Axis</span><span class="p">,</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">Label</span><span class="p">,</span> <span class="n">Name</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.typedef</span> <span class="kn">import</span> <span class="n">infer_return_type</span><span class="p">,</span> <span class="n">DataFrameType</span><span class="p">,</span> <span class="n">ScalarType</span><span class="p">,</span> <span class="n">SeriesType</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.frame</span> <span class="kn">import</span> <span class="n">DataFrame</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.internal</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">InternalField</span><span class="p">,</span>
<span class="n">InternalFrame</span><span class="p">,</span>
<span class="n">HIDDEN_COLUMNS</span><span class="p">,</span>
<span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span>
<span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">,</span>
<span class="n">SPARK_DEFAULT_SERIES_NAME</span><span class="p">,</span>
<span class="n">SPARK_INDEX_NAME_PATTERN</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.missing.groupby</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">MissingPandasLikeDataFrameGroupBy</span><span class="p">,</span>
<span class="n">MissingPandasLikeSeriesGroupBy</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span><span class="p">,</span> <span class="n">first_series</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.spark</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">SF</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.config</span> <span class="kn">import</span> <span class="n">get_option</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.correlation</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">compute</span><span class="p">,</span>
<span class="n">CORRELATION_VALUE_1_COLUMN</span><span class="p">,</span>
<span class="n">CORRELATION_VALUE_2_COLUMN</span><span class="p">,</span>
<span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="p">,</span>
<span class="n">CORRELATION_COUNT_OUTPUT_COLUMN</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.utils</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">align_diff_frames</span><span class="p">,</span>
<span class="n">is_name_like_tuple</span><span class="p">,</span>
<span class="n">is_name_like_value</span><span class="p">,</span>
<span class="n">name_like_string</span><span class="p">,</span>
<span class="n">same_anchor</span><span class="p">,</span>
<span class="n">scol_for</span><span class="p">,</span>
<span class="n">verify_temp_column_name</span><span class="p">,</span>
<span class="n">log_advice</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.spark.utils</span> <span class="kn">import</span> <span class="n">as_nullable_spark_type</span><span class="p">,</span> <span class="n">force_decimal_precision_scale</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.exceptions</span> <span class="kn">import</span> <span class="n">DataError</span>
<span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">RollingGroupby</span><span class="p">,</span> <span class="n">ExpandingGroupby</span><span class="p">,</span> <span class="n">ExponentialMovingGroupby</span>
<span class="c1"># to keep it the same as pandas</span>
<span class="n">NamedAgg</span> <span class="o">=</span> <span class="n">namedtuple</span><span class="p">(</span><span class="s2">&quot;NamedAgg&quot;</span><span class="p">,</span> <span class="p">[</span><span class="s2">&quot;column&quot;</span><span class="p">,</span> <span class="s2">&quot;aggfunc&quot;</span><span class="p">])</span>
<span class="k">class</span> <span class="nc">GroupBy</span><span class="p">(</span><span class="n">Generic</span><span class="p">[</span><span class="n">FrameLike</span><span class="p">],</span> <span class="n">metaclass</span><span class="o">=</span><span class="n">ABCMeta</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> :ivar _psdf: The parent dataframe that is used to perform the groupby</span>
<span class="sd"> :type _psdf: DataFrame</span>
<span class="sd"> :ivar _groupkeys: The list of keys that will be used to perform the grouping</span>
<span class="sd"> :type _groupkeys: List[Series]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">groupkeys</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span>
<span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
<span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
<span class="n">column_labels_to_exclude</span><span class="p">:</span> <span class="n">Set</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span>
<span class="n">agg_columns_selected</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
<span class="n">agg_columns</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span>
<span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> <span class="o">=</span> <span class="n">psdf</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span> <span class="o">=</span> <span class="n">groupkeys</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span> <span class="o">=</span> <span class="n">as_index</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span> <span class="o">=</span> <span class="n">dropna</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span> <span class="o">=</span> <span class="n">column_labels_to_exclude</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span> <span class="o">=</span> <span class="n">agg_columns_selected</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> <span class="o">=</span> <span class="n">agg_columns</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">_groupkeys_scols</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]:</span>
<span class="k">return</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">_agg_columns_scols</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]:</span>
<span class="k">return</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">]</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">&quot;SeriesGroupBy&quot;</span><span class="p">],</span> <span class="n">Series</span><span class="p">],</span>
<span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="k">pass</span>
<span class="nd">@abstractmethod</span>
<span class="k">def</span> <span class="nf">_handle_output</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">agg_column_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="k">pass</span>
<span class="c1"># TODO: Series support is not implemented yet.</span>
<span class="c1"># TODO: not all arguments are implemented comparing to pandas&#39; for now.</span>
<span class="k">def</span> <span class="nf">aggregate</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">func_or_funcs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Aggregate using one or more operations over the specified axis.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> func_or_funcs : dict, str or list</span>
<span class="sd"> a dict mapping from column name (string) to</span>
<span class="sd"> aggregate functions (string or list of strings).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> The return can be:</span>
<span class="sd"> * Series : when DataFrame.agg is called with a single function</span>
<span class="sd"> * DataFrame : when DataFrame.agg is called with several functions</span>
<span class="sd"> Return Series or DataFrame.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> `agg` is an alias for `aggregate`. Use the alias.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 1, 2, 2],</span>
<span class="sd"> ... &#39;B&#39;: [1, 2, 3, 4],</span>
<span class="sd"> ... &#39;C&#39;: [0.362, 0.227, 1.267, -0.562]},</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C</span>
<span class="sd"> 0 1 1 0.362</span>
<span class="sd"> 1 1 2 0.227</span>
<span class="sd"> 2 2 3 1.267</span>
<span class="sd"> 3 2 4 -0.562</span>
<span class="sd"> Different aggregations per column</span>
<span class="sd"> &gt;&gt;&gt; aggregated = df.groupby(&#39;A&#39;).agg({&#39;B&#39;: &#39;min&#39;, &#39;C&#39;: &#39;sum&#39;})</span>
<span class="sd"> &gt;&gt;&gt; aggregated[[&#39;B&#39;, &#39;C&#39;]].sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 1 1 0.589</span>
<span class="sd"> 2 3 0.705</span>
<span class="sd"> &gt;&gt;&gt; aggregated = df.groupby(&#39;A&#39;).agg({&#39;B&#39;: [&#39;min&#39;, &#39;max&#39;]})</span>
<span class="sd"> &gt;&gt;&gt; aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B</span>
<span class="sd"> min max</span>
<span class="sd"> A</span>
<span class="sd"> 1 1 2</span>
<span class="sd"> 2 3 4</span>
<span class="sd"> &gt;&gt;&gt; aggregated = df.groupby(&#39;A&#39;).agg(&#39;min&#39;)</span>
<span class="sd"> &gt;&gt;&gt; aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 1 1 0.227</span>
<span class="sd"> 2 3 -0.562</span>
<span class="sd"> &gt;&gt;&gt; aggregated = df.groupby(&#39;A&#39;).agg([&#39;min&#39;, &#39;max&#39;])</span>
<span class="sd"> &gt;&gt;&gt; aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C</span>
<span class="sd"> min max min max</span>
<span class="sd"> A</span>
<span class="sd"> 1 1 2 0.227 0.362</span>
<span class="sd"> 2 3 4 -0.562 1.267</span>
<span class="sd"> To control the output names with different aggregations per column, pandas-on-Spark</span>
<span class="sd"> also supports &#39;named aggregation&#39; or nested renaming in .agg. It can also be</span>
<span class="sd"> used when applying multiple aggregation functions to specific columns.</span>
<span class="sd"> &gt;&gt;&gt; aggregated = df.groupby(&#39;A&#39;).agg(b_max=ps.NamedAgg(column=&#39;B&#39;, aggfunc=&#39;max&#39;))</span>
<span class="sd"> &gt;&gt;&gt; aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> b_max</span>
<span class="sd"> A</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 4</span>
<span class="sd"> &gt;&gt;&gt; aggregated = df.groupby(&#39;A&#39;).agg(b_max=(&#39;B&#39;, &#39;max&#39;), b_min=(&#39;B&#39;, &#39;min&#39;))</span>
<span class="sd"> &gt;&gt;&gt; aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> b_max b_min</span>
<span class="sd"> A</span>
<span class="sd"> 1 2 1</span>
<span class="sd"> 2 4 3</span>
<span class="sd"> &gt;&gt;&gt; aggregated = df.groupby(&#39;A&#39;).agg(b_max=(&#39;B&#39;, &#39;max&#39;), c_min=(&#39;C&#39;, &#39;min&#39;))</span>
<span class="sd"> &gt;&gt;&gt; aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> b_max c_min</span>
<span class="sd"> A</span>
<span class="sd"> 1 2 0.227</span>
<span class="sd"> 2 4 -0.562</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># I think current implementation of func and arguments in pandas-on-Spark for aggregate</span>
<span class="c1"># is different than pandas, later once arguments are added, this could be removed.</span>
<span class="k">if</span> <span class="n">func_or_funcs</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">kwargs</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;No aggregation argument or function specified.&quot;</span><span class="p">)</span>
<span class="n">relabeling</span> <span class="o">=</span> <span class="n">func_or_funcs</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">is_multi_agg_with_relabel</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">if</span> <span class="n">relabeling</span><span class="p">:</span>
<span class="p">(</span>
<span class="n">func_or_funcs</span><span class="p">,</span>
<span class="n">columns</span><span class="p">,</span>
<span class="n">order</span><span class="p">,</span>
<span class="p">)</span> <span class="o">=</span> <span class="n">normalize_keyword_aggregation</span><span class="p">(</span> <span class="c1"># type: ignore[assignment]</span>
<span class="n">kwargs</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func_or_funcs</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="nb">list</span><span class="p">)):</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func_or_funcs</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span>
<span class="n">is_name_like_value</span><span class="p">(</span><span class="n">key</span><span class="p">)</span>
<span class="ow">and</span> <span class="p">(</span>
<span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span>
<span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span>
<span class="ow">and</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">value</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">func_or_funcs</span><span class="o">.</span><span class="n">items</span><span class="p">()</span>
<span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;aggs must be a dict mapping from column name &quot;</span>
<span class="s2">&quot;to aggregate functions (string or list of strings).&quot;</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">agg_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span><span class="o">.</span><span class="n">name</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">]</span>
<span class="n">func_or_funcs</span> <span class="o">=</span> <span class="p">{</span><span class="n">col</span><span class="p">:</span> <span class="n">func_or_funcs</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">agg_cols</span><span class="p">}</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_groupby</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> <span class="n">func_or_funcs</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">:</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span>
<span class="n">subset</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">:</span>
<span class="n">index_cols</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="n">should_drop_index</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span>
<span class="n">i</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">gkey</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> <span class="k">if</span> <span class="n">gkey</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">should_drop_index</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">drop</span> <span class="o">=</span> <span class="ow">not</span> <span class="nb">any</span><span class="p">(</span>
<span class="p">[</span>
<span class="nb">isinstance</span><span class="p">(</span><span class="n">func_or_funcs</span><span class="p">[</span><span class="n">gkey</span><span class="o">.</span><span class="n">name</span><span class="p">],</span> <span class="nb">list</span><span class="p">)</span>
<span class="k">for</span> <span class="n">gkey</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span>
<span class="k">if</span> <span class="n">gkey</span><span class="o">.</span><span class="n">name</span> <span class="ow">in</span> <span class="n">func_or_funcs</span>
<span class="p">]</span>
<span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">level</span><span class="o">=</span><span class="n">should_drop_index</span><span class="p">,</span> <span class="n">drop</span><span class="o">=</span><span class="n">drop</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">should_drop_index</span><span class="p">)</span> <span class="o">&lt;</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">):</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span>
<span class="n">index_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">c</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="k">if</span> <span class="n">c</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">index_cols</span><span class="p">]</span>
<span class="k">if</span> <span class="n">relabeling</span><span class="p">:</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">(</span><span class="n">index_cols</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">order</span><span class="p">))]</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">([</span><span class="n">c</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">index_cols</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">columns</span><span class="p">))</span>
<span class="k">if</span> <span class="n">relabeling</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">:</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">order</span><span class="p">]</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">columns</span> <span class="c1"># type: ignore[assignment]</span>
<span class="k">return</span> <span class="n">psdf</span>
<span class="n">agg</span> <span class="o">=</span> <span class="n">aggregate</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_spark_groupby</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">func</span><span class="p">:</span> <span class="n">Mapping</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]],</span>
<span class="n">groupkeys</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Series</span><span class="p">]</span> <span class="o">=</span> <span class="p">(),</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">InternalFrame</span><span class="p">:</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))]</span>
<span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span>
<span class="n">multi_aggs</span> <span class="o">=</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">func</span><span class="o">.</span><span class="n">values</span><span class="p">())</span>
<span class="n">reordered</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">data_columns</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">func</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="n">label</span> <span class="o">=</span> <span class="n">key</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">key</span><span class="p">,)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">!=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;The length of the key must be the same as the column label level.&quot;</span><span class="p">)</span>
<span class="k">for</span> <span class="n">aggfunc</span> <span class="ow">in</span> <span class="p">[</span><span class="n">value</span><span class="p">]</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="n">value</span><span class="p">:</span>
<span class="n">column_label</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="n">aggfunc</span><span class="p">])</span> <span class="k">if</span> <span class="n">multi_aggs</span> <span class="k">else</span> <span class="n">label</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">column_label</span><span class="p">)</span>
<span class="n">data_col</span> <span class="o">=</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">column_label</span><span class="p">)</span>
<span class="n">data_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">data_col</span><span class="p">)</span>
<span class="n">col_name</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">if</span> <span class="n">aggfunc</span> <span class="o">==</span> <span class="s2">&quot;nunique&quot;</span><span class="p">:</span>
<span class="n">reordered</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span><span class="s2">&quot;count(DISTINCT `</span><span class="si">{0}</span><span class="s2">`) as `</span><span class="si">{1}</span><span class="s2">`&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">col_name</span><span class="p">,</span> <span class="n">data_col</span><span class="p">))</span>
<span class="p">)</span>
<span class="c1"># Implement &quot;quartiles&quot; aggregate function for ``describe``.</span>
<span class="k">elif</span> <span class="n">aggfunc</span> <span class="o">==</span> <span class="s2">&quot;quartiles&quot;</span><span class="p">:</span>
<span class="n">reordered</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span>
<span class="s2">&quot;percentile_approx(`</span><span class="si">{0}</span><span class="s2">`, array(0.25, 0.5, 0.75)) as `</span><span class="si">{1}</span><span class="s2">`&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">col_name</span><span class="p">,</span> <span class="n">data_col</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">reordered</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">{1}</span><span class="s2">(`</span><span class="si">{0}</span><span class="s2">`) as `</span><span class="si">{2}</span><span class="s2">`&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">col_name</span><span class="p">,</span> <span class="n">aggfunc</span><span class="p">,</span> <span class="n">data_col</span><span class="p">))</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">groupkey_scols</span> <span class="o">+</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">reordered</span><span class="p">)</span>
<span class="k">return</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">groupkeys</span><span class="p">],</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span>
<span class="p">)</span>
<div class="viewcode-block" id="GroupBy.count"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.count.html#pyspark.pandas.groupby.GroupBy.count">[docs]</a> <span class="k">def</span> <span class="nf">count</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute count of group, excluding missing values.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 1, 2, 1, 2],</span>
<span class="sd"> ... &#39;B&#39;: [np.nan, 2, 3, 4, 5],</span>
<span class="sd"> ... &#39;C&#39;: [1, 2, 1, 1, 2]}, columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).count().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 1 2 3</span>
<span class="sd"> 2 2 2</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.first"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.first.html#pyspark.pandas.groupby.GroupBy.first">[docs]</a> <span class="k">def</span> <span class="nf">first</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute first of group values.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> numeric_only : bool, default False</span>
<span class="sd"> Include only float, int, boolean columns. If None, will attempt to use</span>
<span class="sd"> everything, then use only numeric data.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> min_count : int, default -1</span>
<span class="sd"> The required number of valid values to perform the operation. If fewer</span>
<span class="sd"> than ``min_count`` non-NA values are present the result will be NA.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&quot;A&quot;: [1, 2, 1, 2], &quot;B&quot;: [True, False, False, True],</span>
<span class="sd"> ... &quot;C&quot;: [3, 3, 4, 4], &quot;D&quot;: [&quot;a&quot;, &quot;b&quot;, &quot;a&quot;, &quot;a&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 1 True 3 a</span>
<span class="sd"> 1 2 False 3 b</span>
<span class="sd"> 2 1 False 4 a</span>
<span class="sd"> 3 2 True 4 a</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).first().sort_index()</span>
<span class="sd"> B C D</span>
<span class="sd"> A</span>
<span class="sd"> 1 True 3 a</span>
<span class="sd"> 2 False 3 b</span>
<span class="sd"> Include only float, int, boolean columns when set numeric_only True.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).first(numeric_only=True).sort_index()</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 1 True 3</span>
<span class="sd"> 2 False 3</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;D&quot;).first().sort_index()</span>
<span class="sd"> A B C</span>
<span class="sd"> D</span>
<span class="sd"> a 1 True 3</span>
<span class="sd"> b 2 False 3</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;D&quot;).first(min_count=3).sort_index()</span>
<span class="sd"> A B C</span>
<span class="sd"> D</span>
<span class="sd"> a 1.0 True 3.0</span>
<span class="sd"> b NaN None NaN</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">min_count</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;min_count must be integer&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">first</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">ignorenulls</span><span class="o">=</span><span class="kc">True</span><span class="p">),</span>
<span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)</span> <span class="k">if</span> <span class="n">numeric_only</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.last"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.last.html#pyspark.pandas.groupby.GroupBy.last">[docs]</a> <span class="k">def</span> <span class="nf">last</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute last of group values.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> numeric_only : bool, default False</span>
<span class="sd"> Include only float, int, boolean columns. If None, will attempt to use</span>
<span class="sd"> everything, then use only numeric data.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> min_count : int, default -1</span>
<span class="sd"> The required number of valid values to perform the operation. If fewer</span>
<span class="sd"> than ``min_count`` non-NA values are present the result will be NA.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&quot;A&quot;: [1, 2, 1, 2], &quot;B&quot;: [True, False, False, True],</span>
<span class="sd"> ... &quot;C&quot;: [3, 3, 4, 4], &quot;D&quot;: [&quot;a&quot;, &quot;a&quot;, &quot;b&quot;, &quot;a&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 1 True 3 a</span>
<span class="sd"> 1 2 False 3 a</span>
<span class="sd"> 2 1 False 4 b</span>
<span class="sd"> 3 2 True 4 a</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).last().sort_index()</span>
<span class="sd"> B C D</span>
<span class="sd"> A</span>
<span class="sd"> 1 False 4 b</span>
<span class="sd"> 2 True 4 a</span>
<span class="sd"> Include only float, int, boolean columns when set numeric_only True.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).last(numeric_only=True).sort_index()</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 1 False 4</span>
<span class="sd"> 2 True 4</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;D&quot;).last().sort_index()</span>
<span class="sd"> A B C</span>
<span class="sd"> D</span>
<span class="sd"> a 2 True 4</span>
<span class="sd"> b 1 False 4</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;D&quot;).last(min_count=3).sort_index()</span>
<span class="sd"> A B C</span>
<span class="sd"> D</span>
<span class="sd"> a 2.0 True 4.0</span>
<span class="sd"> b NaN None NaN</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">min_count</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;min_count must be integer&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">last</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">ignorenulls</span><span class="o">=</span><span class="kc">True</span><span class="p">),</span>
<span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)</span> <span class="k">if</span> <span class="n">numeric_only</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.max"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.max.html#pyspark.pandas.groupby.GroupBy.max">[docs]</a> <span class="k">def</span> <span class="nf">max</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute max of group values.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> numeric_only : bool, default False</span>
<span class="sd"> Include only float, int, boolean columns. If None, will attempt to use</span>
<span class="sd"> everything, then use only numeric data.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> min_count : bool, default -1</span>
<span class="sd"> The required number of valid values to perform the operation. If fewer</span>
<span class="sd"> than min_count non-NA values are present the result will be NA.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&quot;A&quot;: [1, 2, 1, 2], &quot;B&quot;: [True, False, False, True],</span>
<span class="sd"> ... &quot;C&quot;: [3, 4, 3, 4], &quot;D&quot;: [&quot;a&quot;, &quot;a&quot;, &quot;b&quot;, &quot;a&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).max().sort_index()</span>
<span class="sd"> B C D</span>
<span class="sd"> A</span>
<span class="sd"> 1 True 3 b</span>
<span class="sd"> 2 True 4 a</span>
<span class="sd"> Include only float, int, boolean columns when set numeric_only True.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).max(numeric_only=True).sort_index()</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 1 True 3</span>
<span class="sd"> 2 True 4</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;D&quot;).max().sort_index()</span>
<span class="sd"> A B C</span>
<span class="sd"> D</span>
<span class="sd"> a 2 True 4</span>
<span class="sd"> b 1 False 3</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;D&quot;).max(min_count=3).sort_index()</span>
<span class="sd"> A B C</span>
<span class="sd"> D</span>
<span class="sd"> a 2.0 True 4.0</span>
<span class="sd"> b NaN None NaN</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">min_count</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;min_count must be integer&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">,</span>
<span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)</span> <span class="k">if</span> <span class="n">numeric_only</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.mean"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.mean.html#pyspark.pandas.groupby.GroupBy.mean">[docs]</a> <span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute mean of groups, excluding missing values.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> numeric_only : bool, default False</span>
<span class="sd"> Include only float, int, boolean columns.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> pyspark.pandas.Series or pyspark.pandas.DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 1, 2, 1, 2],</span>
<span class="sd"> ... &#39;B&#39;: [np.nan, 2, 3, 4, 5],</span>
<span class="sd"> ... &#39;C&#39;: [1, 2, 1, 1, 2],</span>
<span class="sd"> ... &#39;D&#39;: [True, False, True, False, True]})</span>
<span class="sd"> Groupby one column and return the mean of the remaining columns in</span>
<span class="sd"> each group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).mean().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C D</span>
<span class="sd"> A</span>
<span class="sd"> 1 3.0 1.333333 0.333333</span>
<span class="sd"> 2 4.0 1.500000 1.000000</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_validate_agg_columns</span><span class="p">(</span><span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">function_name</span><span class="o">=</span><span class="s2">&quot;median&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">mean</span><span class="p">,</span> <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,),</span> <span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span>
<span class="p">)</span></div>
<span class="c1"># TODO: &#39;q&#39; accepts list like type</span>
<div class="viewcode-block" id="GroupBy.quantile"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.quantile.html#pyspark.pandas.groupby.GroupBy.quantile">[docs]</a> <span class="k">def</span> <span class="nf">quantile</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">q</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.5</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10000</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return group values at the given quantile.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> q : float, default 0.5 (50% quantile)</span>
<span class="sd"> Value between 0 and 1 providing the quantile to compute.</span>
<span class="sd"> accuracy : int, optional</span>
<span class="sd"> Default accuracy of approximation. Larger value means better accuracy.</span>
<span class="sd"> The relative error can be deduced by 1.0 / accuracy.</span>
<span class="sd"> This is a panda-on-Spark specific parameter.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> pyspark.pandas.Series or pyspark.pandas.DataFrame</span>
<span class="sd"> Return type determined by caller of GroupBy object.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> `quantile` in pandas-on-Spark are using distributed percentile approximation</span>
<span class="sd"> algorithm unlike pandas, the result might be different with pandas, also</span>
<span class="sd"> `interpolation` parameter is not supported yet.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.quantile</span>
<span class="sd"> pyspark.pandas.DataFrame.quantile</span>
<span class="sd"> pyspark.sql.functions.percentile_approx</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([</span>
<span class="sd"> ... [&#39;a&#39;, 1], [&#39;a&#39;, 2], [&#39;a&#39;, 3],</span>
<span class="sd"> ... [&#39;b&#39;, 1], [&#39;b&#39;, 3], [&#39;b&#39;, 5]</span>
<span class="sd"> ... ], columns=[&#39;key&#39;, &#39;val&#39;])</span>
<span class="sd"> Groupby one column and return the quantile of the remaining columns in</span>
<span class="sd"> each group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;key&#39;).quantile()</span>
<span class="sd"> val</span>
<span class="sd"> key</span>
<span class="sd"> a 2.0</span>
<span class="sd"> b 3.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">q</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;q doesn&#39;t support for list like type for now&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_number</span><span class="p">(</span><span class="n">q</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;must be real number, not </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">q</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="mi">0</span> <span class="o">&lt;=</span> <span class="n">q</span> <span class="o">&lt;=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;&#39;q&#39; must be between 0 and 1. Got &#39;</span><span class="si">%s</span><span class="s2">&#39; instead&quot;</span> <span class="o">%</span> <span class="n">q</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">_agg_col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)</span> <span class="k">for</span> <span class="n">_agg_col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">):</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="sa">f</span><span class="s2">&quot;Allowing bool dtype in </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2">.quantile is deprecated &quot;</span>
<span class="s2">&quot;and will raise in a future version, matching the Series/DataFrame behavior. &quot;</span>
<span class="s2">&quot;Cast to uint8 dtype before calling quantile instead.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">percentile_approx</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">DoubleType</span><span class="p">()),</span> <span class="n">q</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">),</span>
<span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">),</span>
<span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.min"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.min.html#pyspark.pandas.groupby.GroupBy.min">[docs]</a> <span class="k">def</span> <span class="nf">min</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute min of group values.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> numeric_only : bool, default False</span>
<span class="sd"> Include only float, int, boolean columns. If None, will attempt to use</span>
<span class="sd"> everything, then use only numeric data.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> min_count : bool, default -1</span>
<span class="sd"> The required number of valid values to perform the operation. If fewer</span>
<span class="sd"> than min_count non-NA values are present the result will be NA.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&quot;A&quot;: [1, 2, 1, 2], &quot;B&quot;: [True, False, False, True],</span>
<span class="sd"> ... &quot;C&quot;: [3, 4, 3, 4], &quot;D&quot;: [&quot;a&quot;, &quot;a&quot;, &quot;b&quot;, &quot;a&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).min().sort_index()</span>
<span class="sd"> B C D</span>
<span class="sd"> A</span>
<span class="sd"> 1 False 3 a</span>
<span class="sd"> 2 False 4 a</span>
<span class="sd"> Include only float, int, boolean columns when set numeric_only True.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).min(numeric_only=True).sort_index()</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 1 False 3</span>
<span class="sd"> 2 False 4</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;D&quot;).min().sort_index()</span>
<span class="sd"> A B C</span>
<span class="sd"> D</span>
<span class="sd"> a 1 False 3</span>
<span class="sd"> b 1 False 3</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;D&quot;).min(min_count=3).sort_index()</span>
<span class="sd"> A B C</span>
<span class="sd"> D</span>
<span class="sd"> a 1.0 False 3.0</span>
<span class="sd"> b NaN None NaN</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">min_count</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;min_count must be integer&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">,</span>
<span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)</span> <span class="k">if</span> <span class="n">numeric_only</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span><span class="p">,</span>
<span class="p">)</span></div>
<span class="c1"># TODO: sync the doc.</span>
<div class="viewcode-block" id="GroupBy.std"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.std.html#pyspark.pandas.groupby.GroupBy.std">[docs]</a> <span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute standard deviation of groups, excluding missing values.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> ddof : int, default 1</span>
<span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span>
<span class="sd"> where N represents the number of elements.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supported including arbitary integers.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&quot;A&quot;: [1, 2, 1, 2], &quot;B&quot;: [True, False, False, True],</span>
<span class="sd"> ... &quot;C&quot;: [3, 4, 3, 4], &quot;D&quot;: [&quot;a&quot;, &quot;b&quot;, &quot;b&quot;, &quot;a&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).std()</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 1 0.707107 0.0</span>
<span class="sd"> 2 0.707107 0.0</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ddof</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;ddof must be integer&quot;</span><span class="p">)</span>
<span class="c1"># Raise the TypeError when all aggregation columns are of unaccepted data types</span>
<span class="n">any_accepted</span> <span class="o">=</span> <span class="nb">any</span><span class="p">(</span>
<span class="nb">isinstance</span><span class="p">(</span><span class="n">_agg_col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">))</span>
<span class="k">for</span> <span class="n">_agg_col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">any_accepted</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Unaccepted data types of aggregation columns; numeric or bool expected.&quot;</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">return</span> <span class="n">SF</span><span class="o">.</span><span class="n">stddev</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">ddof</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">std</span><span class="p">,</span>
<span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,),</span>
<span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.sum"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.sum.html#pyspark.pandas.groupby.GroupBy.sum">[docs]</a> <span class="k">def</span> <span class="nf">sum</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute sum of group values</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> numeric_only : bool, default False</span>
<span class="sd"> Include only float, int, boolean columns.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> min_count : int, default 0</span>
<span class="sd"> The required number of valid values to perform the operation.</span>
<span class="sd"> If fewer than min_count non-NA values are present the result will be NA.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&quot;A&quot;: [1, 2, 1, 2], &quot;B&quot;: [True, False, False, True],</span>
<span class="sd"> ... &quot;C&quot;: [3, 4, 3, 4], &quot;D&quot;: [&quot;a&quot;, &quot;a&quot;, &quot;b&quot;, &quot;a&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).sum().sort_index()</span>
<span class="sd"> B C D</span>
<span class="sd"> A</span>
<span class="sd"> 1 1 6 ab</span>
<span class="sd"> 2 1 8 aa</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;D&quot;).sum().sort_index()</span>
<span class="sd"> A B C</span>
<span class="sd"> D</span>
<span class="sd"> a 5 2 11</span>
<span class="sd"> b 1 0 3</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;D&quot;).sum(min_count=3).sort_index()</span>
<span class="sd"> A B C</span>
<span class="sd"> D</span>
<span class="sd"> a 5.0 2.0 11.0</span>
<span class="sd"> b NaN NaN NaN</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> There is a behavior difference between pandas-on-Spark and pandas:</span>
<span class="sd"> * when there is a non-numeric aggregation column, it will be ignored</span>
<span class="sd"> even if `numeric_only` is False.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">numeric_only</span><span class="p">,</span> <span class="nb">bool</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;numeric_only must be None or bool&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">min_count</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;min_count must be integer&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">numeric_only</span><span class="p">:</span>
<span class="n">unsupported</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">col</span><span class="o">.</span><span class="n">name</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">,</span> <span class="n">StringType</span><span class="p">))</span>
<span class="p">]</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">unsupported</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;GroupBy.sum() can only support numeric, bool and string columns even if&quot;</span>
<span class="sa">f</span><span class="s2">&quot;numeric_only=False, skip unsupported columns: </span><span class="si">{</span><span class="n">unsupported</span><span class="si">}</span><span class="s2">&quot;</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">,</span>
<span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">,</span> <span class="n">StringType</span><span class="p">),</span>
<span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span><span class="p">,</span>
<span class="p">)</span></div>
<span class="c1"># TODO: sync the doc.</span>
<div class="viewcode-block" id="GroupBy.var"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.var.html#pyspark.pandas.groupby.GroupBy.var">[docs]</a> <span class="k">def</span> <span class="nf">var</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute variance of groups, excluding missing values.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> ddof : int, default 1</span>
<span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span>
<span class="sd"> where N represents the number of elements.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supported including arbitary integers.</span>
<span class="sd"> numeric_only : bool, default False</span>
<span class="sd"> Include only float, int, boolean columns.</span>
<span class="sd"> .. versionadded:: 4.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&quot;A&quot;: [1, 2, 1, 2], &quot;B&quot;: [True, False, False, True],</span>
<span class="sd"> ... &quot;C&quot;: [3, 4, 3, 4], &quot;D&quot;: [&quot;a&quot;, &quot;b&quot;, &quot;b&quot;, &quot;a&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).var()</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 1 0.5 0.0</span>
<span class="sd"> 2 0.5 0.0</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ddof</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;ddof must be integer&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">var</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">return</span> <span class="n">SF</span><span class="o">.</span><span class="n">var</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">ddof</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">var</span><span class="p">,</span>
<span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,),</span>
<span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span>
<span class="p">)</span></div>
<span class="k">def</span> <span class="nf">skew</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute skewness of groups, excluding missing values.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&quot;A&quot;: [1, 2, 1, 1], &quot;B&quot;: [True, False, False, True],</span>
<span class="sd"> ... &quot;C&quot;: [3, 4, 3, 4], &quot;D&quot;: [&quot;a&quot;, &quot;b&quot;, &quot;b&quot;, &quot;a&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).skew()</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 1 -1.732051 1.732051</span>
<span class="sd"> 2 NaN NaN</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">SF</span><span class="o">.</span><span class="n">skew</span><span class="p">,</span>
<span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,),</span>
<span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span>
<div class="viewcode-block" id="GroupBy.sem"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.sem.html#pyspark.pandas.groupby.GroupBy.sem">[docs]</a> <span class="k">def</span> <span class="nf">sem</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute standard error of the mean of groups, excluding missing values.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> ddof : int, default 1</span>
<span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span>
<span class="sd"> where N represents the number of elements.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&quot;A&quot;: [1, 2, 1, 1], &quot;B&quot;: [True, False, False, True],</span>
<span class="sd"> ... &quot;C&quot;: [3, None, 3, 4], &quot;D&quot;: [&quot;a&quot;, &quot;b&quot;, &quot;b&quot;, &quot;a&quot;]})</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).sem()</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 1 0.333333 0.333333</span>
<span class="sd"> 2 NaN NaN</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;D&quot;).sem(ddof=1)</span>
<span class="sd"> A B C</span>
<span class="sd"> D</span>
<span class="sd"> a 0.0 0.0 0.5</span>
<span class="sd"> b 0.5 0.0 NaN</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).sem()</span>
<span class="sd"> A</span>
<span class="sd"> 1 0.333333</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> Name: B, dtype: float64</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.sem</span>
<span class="sd"> pyspark.pandas.DataFrame.sem</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ddof</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;ddof must be integer&quot;</span><span class="p">)</span>
<span class="c1"># Raise the TypeError when all aggregation columns are of unaccepted data types</span>
<span class="n">any_accepted</span> <span class="o">=</span> <span class="nb">any</span><span class="p">(</span>
<span class="nb">isinstance</span><span class="p">(</span><span class="n">_agg_col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">))</span>
<span class="k">for</span> <span class="n">_agg_col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">any_accepted</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Unaccepted data types of aggregation columns; numeric or bool expected.&quot;</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">sem</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">return</span> <span class="n">SF</span><span class="o">.</span><span class="n">stddev</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">ddof</span><span class="p">)</span> <span class="o">/</span> <span class="n">F</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">col</span><span class="p">))</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">sem</span><span class="p">,</span>
<span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">),</span>
<span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<span class="c1"># TODO: 1, &#39;n&#39; accepts list and slice; 2, implement &#39;dropna&#39; parameter</span>
<div class="viewcode-block" id="GroupBy.nth"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.nth.html#pyspark.pandas.groupby.GroupBy.nth">[docs]</a> <span class="k">def</span> <span class="nf">nth</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Take the nth row from each group.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> n : int</span>
<span class="sd"> A single nth value for the row</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> There is a behavior difference between pandas-on-Spark and pandas:</span>
<span class="sd"> * when there is no aggregation column, and `n` not equal to 0 or -1,</span>
<span class="sd"> the returned empty dataframe may have an index with different lenght `__len__`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import numpy as np</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 1, 2, 1, 2],</span>
<span class="sd"> ... &#39;B&#39;: [np.nan, 2, 3, 4, 5]}, columns=[&#39;A&#39;, &#39;B&#39;])</span>
<span class="sd"> &gt;&gt;&gt; g = df.groupby(&#39;A&#39;)</span>
<span class="sd"> &gt;&gt;&gt; g.nth(0)</span>
<span class="sd"> A B</span>
<span class="sd"> 0 1 NaN</span>
<span class="sd"> 2 2 3.0</span>
<span class="sd"> &gt;&gt;&gt; g.nth(1)</span>
<span class="sd"> A B</span>
<span class="sd"> 1 1 2.0</span>
<span class="sd"> 4 2 5.0</span>
<span class="sd"> &gt;&gt;&gt; g.nth(-1)</span>
<span class="sd"> A B</span>
<span class="sd"> 3 1 4.0</span>
<span class="sd"> 4 2 5.0</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="nb">slice</span><span class="p">)</span> <span class="ow">or</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">n</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">&quot;n doesn&#39;t support slice or list for now&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;Invalid index </span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">n</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="n">groupkey_names</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="nb">str</span><span class="p">(</span><span class="n">groupkey</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">groupkey</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">window1</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span>
<span class="n">tmp_row_number_col</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;__tmp_row_number_col__&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">n</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span>
<span class="n">tmp_row_number_col</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window1</span><span class="p">)</span>
<span class="p">)</span>
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">tmp_row_number_col</span><span class="p">)</span> <span class="o">==</span> <span class="n">n</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span>
<span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">tmp_row_number_col</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">window2</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span>
<span class="n">Window</span><span class="o">.</span><span class="n">unboundedPreceding</span><span class="p">,</span> <span class="n">Window</span><span class="o">.</span><span class="n">unboundedFollowing</span>
<span class="p">)</span>
<span class="n">tmp_group_size_col</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;__tmp_group_size_col__&quot;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span>
<span class="n">tmp_group_size_col</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window2</span><span class="p">)</span>
<span class="p">)</span>
<span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">tmp_row_number_col</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window1</span><span class="p">))</span>
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">tmp_row_number_col</span><span class="p">)</span> <span class="o">==</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">tmp_group_size_col</span><span class="p">)</span> <span class="o">+</span> <span class="mi">1</span> <span class="o">+</span> <span class="n">n</span><span class="p">)</span>
<span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">tmp_group_size_col</span><span class="p">,</span> <span class="n">tmp_row_number_col</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span>
<span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">:</span>
<span class="n">agg_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">:</span>
<span class="n">agg_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="n">internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">,</span>
<span class="n">index_fields</span><span class="o">=</span><span class="n">internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">,</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span>
<span class="n">column_label_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">agg_column_names</span> <span class="o">=</span> <span class="p">(</span>
<span class="p">[</span><span class="nb">str</span><span class="p">(</span><span class="n">agg_column</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">agg_column</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">]</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span>
<span class="k">else</span> <span class="kc">None</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_return</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">),</span> <span class="n">agg_column_names</span><span class="o">=</span><span class="n">agg_column_names</span><span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.prod"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.prod.html#pyspark.pandas.groupby.GroupBy.prod">[docs]</a> <span class="k">def</span> <span class="nf">prod</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute prod of groups.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> numeric_only : bool, default False</span>
<span class="sd"> Include only float, int, boolean columns.</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> min_count : int, default 0</span>
<span class="sd"> The required number of valid values to perform the operation.</span>
<span class="sd"> If fewer than min_count non-NA values are present the result will be NA.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> Computed prod of values within each group.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import numpy as np</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(</span>
<span class="sd"> ... {</span>
<span class="sd"> ... &quot;A&quot;: [1, 1, 2, 1, 2],</span>
<span class="sd"> ... &quot;B&quot;: [np.nan, 2, 3, 4, 5],</span>
<span class="sd"> ... &quot;C&quot;: [1, 2, 1, 1, 2],</span>
<span class="sd"> ... &quot;D&quot;: [True, False, True, False, True],</span>
<span class="sd"> ... }</span>
<span class="sd"> ... )</span>
<span class="sd"> Groupby one column and return the prod of the remaining columns in</span>
<span class="sd"> each group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).prod().sort_index()</span>
<span class="sd"> B C D</span>
<span class="sd"> A</span>
<span class="sd"> 1 8.0 2 0</span>
<span class="sd"> 2 15.0 2 1</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).prod(min_count=3).sort_index()</span>
<span class="sd"> B C D</span>
<span class="sd"> A</span>
<span class="sd"> 1 NaN 2.0 0.0</span>
<span class="sd"> 2 NaN NaN NaN</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">min_count</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;min_count must be integer&quot;</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_validate_agg_columns</span><span class="p">(</span><span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">function_name</span><span class="o">=</span><span class="s2">&quot;prod&quot;</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">SF</span><span class="o">.</span><span class="n">product</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="kc">True</span><span class="p">),</span>
<span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">),</span>
<span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.all"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.all.html#pyspark.pandas.groupby.GroupBy.all">[docs]</a> <span class="k">def</span> <span class="nf">all</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns True if all values in the group are truthful, else False.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> skipna : bool, default True</span>
<span class="sd"> Flag to ignore NA(nan/null) values during truth testing.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],</span>
<span class="sd"> ... &#39;B&#39;: [True, True, True, False, False,</span>
<span class="sd"> ... False, None, True, None, False]},</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 1 True</span>
<span class="sd"> 1 1 True</span>
<span class="sd"> 2 2 True</span>
<span class="sd"> 3 2 False</span>
<span class="sd"> 4 3 False</span>
<span class="sd"> 5 3 False</span>
<span class="sd"> 6 4 None</span>
<span class="sd"> 7 4 True</span>
<span class="sd"> 8 5 None</span>
<span class="sd"> 9 5 False</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).all().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B</span>
<span class="sd"> A</span>
<span class="sd"> 1 True</span>
<span class="sd"> 2 False</span>
<span class="sd"> 3 False</span>
<span class="sd"> 4 True</span>
<span class="sd"> 5 False</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).all(skipna=False).sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B</span>
<span class="sd"> A</span>
<span class="sd"> 1 True</span>
<span class="sd"> 2 False</span>
<span class="sd"> 3 False</span>
<span class="sd"> 4 False</span>
<span class="sd"> 5 False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span>
<span class="n">internal</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_reduce</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">sfun</span><span class="p">(</span><span class="n">scol</span><span class="p">:</span> <span class="n">Column</span><span class="p">,</span> <span class="n">scol_type</span><span class="p">:</span> <span class="n">DataType</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">scol_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)</span> <span class="ow">or</span> <span class="n">skipna</span><span class="p">:</span>
<span class="c1"># np.nan takes no effect to the result; None takes no effect if `skipna`</span>
<span class="n">all_col</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;boolean&quot;</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)))</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># Take None as False when not `skipna`</span>
<span class="n">all_col</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;boolean&quot;</span><span class="p">)))</span>
<span class="k">return</span> <span class="n">all_col</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">stat_exprs</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="n">stat_exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
<span class="n">sfun</span><span class="p">(</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">nan_to_null</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span>
<span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">stat_exprs</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_return</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<span class="c1"># TODO: skipna should be implemented.</span>
<div class="viewcode-block" id="GroupBy.any"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.any.html#pyspark.pandas.groupby.GroupBy.any">[docs]</a> <span class="k">def</span> <span class="nf">any</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns True if any value in the group is truthful, else False.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],</span>
<span class="sd"> ... &#39;B&#39;: [True, True, True, False, False,</span>
<span class="sd"> ... False, None, True, None, False]},</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 1 True</span>
<span class="sd"> 1 1 True</span>
<span class="sd"> 2 2 True</span>
<span class="sd"> 3 2 False</span>
<span class="sd"> 4 3 False</span>
<span class="sd"> 5 3 False</span>
<span class="sd"> 6 4 None</span>
<span class="sd"> 7 4 True</span>
<span class="sd"> 8 5 None</span>
<span class="sd"> 9 5 False</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).any().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B</span>
<span class="sd"> A</span>
<span class="sd"> 1 True</span>
<span class="sd"> 2 True</span>
<span class="sd"> 3 False</span>
<span class="sd"> 4 True</span>
<span class="sd"> 5 False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;boolean&quot;</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">)))</span>
<span class="p">)</span></div>
<span class="c1"># TODO: groupby multiply columns should be implemented.</span>
<div class="viewcode-block" id="GroupBy.size"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.size.html#pyspark.pandas.groupby.GroupBy.size">[docs]</a> <span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute group sizes.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;B&#39;: [1, 1, 2, 3, 3, 3]},</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 1 1</span>
<span class="sd"> 1 2 1</span>
<span class="sd"> 2 2 2</span>
<span class="sd"> 3 3 3</span>
<span class="sd"> 4 3 3</span>
<span class="sd"> 5 3 3</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).size().sort_index()</span>
<span class="sd"> A</span>
<span class="sd"> 1 1</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 3</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;A&#39;, &#39;B&#39;]).size().sort_index()</span>
<span class="sd"> A B</span>
<span class="sd"> 1 1 1</span>
<span class="sd"> 2 1 1</span>
<span class="sd"> 2 1</span>
<span class="sd"> 3 3 3</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> For Series,</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).size().sort_index()</span>
<span class="sd"> A</span>
<span class="sd"> 1 1</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 3</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(df.A).B.size().sort_index()</span>
<span class="sd"> A</span>
<span class="sd"> 1 1</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 3</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">groupkeys</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))]</span>
<span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">groupkey_scols</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">groupkeys</span><span class="p">],</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;count&quot;</span><span class="p">)],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<div class="viewcode-block" id="GroupBy.diff"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.diff.html#pyspark.pandas.groupby.GroupBy.diff">[docs]</a> <span class="k">def</span> <span class="nf">diff</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> First discrete difference of element.</span>
<span class="sd"> Calculates the difference of a DataFrame element compared with another element in the</span>
<span class="sd"> DataFrame group (default is the element in the same column of the previous row).</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> periods : int, default 1</span>
<span class="sd"> Periods to shift for calculating difference, accepts negative values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> diffed : DataFrame or Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, 4, 5, 6],</span>
<span class="sd"> ... &#39;b&#39;: [1, 1, 2, 3, 5, 8],</span>
<span class="sd"> ... &#39;c&#39;: [1, 4, 9, 16, 25, 36]}, columns=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b c</span>
<span class="sd"> 0 1 1 1</span>
<span class="sd"> 1 2 1 4</span>
<span class="sd"> 2 3 2 9</span>
<span class="sd"> 3 4 3 16</span>
<span class="sd"> 4 5 5 25</span>
<span class="sd"> 5 6 8 36</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;b&#39;]).diff().sort_index()</span>
<span class="sd"> a c</span>
<span class="sd"> 0 NaN NaN</span>
<span class="sd"> 1 1.0 3.0</span>
<span class="sd"> 2 NaN NaN</span>
<span class="sd"> 3 NaN NaN</span>
<span class="sd"> 4 NaN NaN</span>
<span class="sd"> 5 NaN NaN</span>
<span class="sd"> Difference with previous column in a group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;b&#39;])[&#39;a&#39;].diff().sort_index()</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 1.0</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> 4 NaN</span>
<span class="sd"> 5 NaN</span>
<span class="sd"> Name: a, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_diff</span><span class="p">(</span><span class="n">periods</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.cumcount"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cumcount.html#pyspark.pandas.groupby.GroupBy.cumcount">[docs]</a> <span class="k">def</span> <span class="nf">cumcount</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Number each item in each group from 0 to the length of that group - 1.</span>
<span class="sd"> Essentially this is equivalent to</span>
<span class="sd"> .. code-block:: python</span>
<span class="sd"> self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> ascending : bool, default True</span>
<span class="sd"> If False, number in reverse, from length of group - 1 to 0.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series</span>
<span class="sd"> Sequence number of each element within each group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[&#39;a&#39;], [&#39;a&#39;], [&#39;a&#39;], [&#39;b&#39;], [&#39;b&#39;], [&#39;a&#39;]],</span>
<span class="sd"> ... columns=[&#39;A&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A</span>
<span class="sd"> 0 a</span>
<span class="sd"> 1 a</span>
<span class="sd"> 2 a</span>
<span class="sd"> 3 b</span>
<span class="sd"> 4 b</span>
<span class="sd"> 5 a</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).cumcount().sort_index()</span>
<span class="sd"> 0 0</span>
<span class="sd"> 1 1</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 0</span>
<span class="sd"> 4 1</span>
<span class="sd"> 5 3</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).cumcount(ascending=False).sort_index()</span>
<span class="sd"> 0 3</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 1</span>
<span class="sd"> 3 1</span>
<span class="sd"> 4 0</span>
<span class="sd"> 5 0</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">ret</span> <span class="o">=</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="o">.</span><span class="n">rename</span><span class="p">()</span>
<span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="k">lambda</span> <span class="n">_</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
<span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">ascending</span><span class="o">=</span><span class="n">ascending</span><span class="p">)</span>
<span class="o">-</span> <span class="mi">1</span>
<span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">ret</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<div class="viewcode-block" id="GroupBy.cummax"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cummax.html#pyspark.pandas.groupby.GroupBy.cummax">[docs]</a> <span class="k">def</span> <span class="nf">cummax</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Cumulative max for each group.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.cummax</span>
<span class="sd"> DataFrame.cummax</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(</span>
<span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span>
<span class="sd"> ... columns=list(&#39;ABC&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C</span>
<span class="sd"> 0 1 NaN 4</span>
<span class="sd"> 1 1 0.1 3</span>
<span class="sd"> 2 1 20.0 2</span>
<span class="sd"> 3 4 10.0 1</span>
<span class="sd"> By default, iterates over rows and finds the sum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).cummax().sort_index()</span>
<span class="sd"> B C</span>
<span class="sd"> 0 NaN 4</span>
<span class="sd"> 1 0.1 4</span>
<span class="sd"> 2 20.0 4</span>
<span class="sd"> 3 10.0 1</span>
<span class="sd"> It works as below in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.C.groupby(df.A).cummax().sort_index()</span>
<span class="sd"> 0 4</span>
<span class="sd"> 1 4</span>
<span class="sd"> 2 4</span>
<span class="sd"> 3 1</span>
<span class="sd"> Name: C, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span>
<span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.cummin"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cummin.html#pyspark.pandas.groupby.GroupBy.cummin">[docs]</a> <span class="k">def</span> <span class="nf">cummin</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Cumulative min for each group.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.cummin</span>
<span class="sd"> DataFrame.cummin</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(</span>
<span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span>
<span class="sd"> ... columns=list(&#39;ABC&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C</span>
<span class="sd"> 0 1 NaN 4</span>
<span class="sd"> 1 1 0.1 3</span>
<span class="sd"> 2 1 20.0 2</span>
<span class="sd"> 3 4 10.0 1</span>
<span class="sd"> By default, iterates over rows and finds the sum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).cummin().sort_index()</span>
<span class="sd"> B C</span>
<span class="sd"> 0 NaN 4</span>
<span class="sd"> 1 0.1 3</span>
<span class="sd"> 2 0.1 2</span>
<span class="sd"> 3 10.0 1</span>
<span class="sd"> It works as below in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).cummin().sort_index()</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 0.1</span>
<span class="sd"> 2 0.1</span>
<span class="sd"> 3 10.0</span>
<span class="sd"> Name: B, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span>
<span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.cumprod"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cumprod.html#pyspark.pandas.groupby.GroupBy.cumprod">[docs]</a> <span class="k">def</span> <span class="nf">cumprod</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Cumulative product for each group.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.cumprod</span>
<span class="sd"> DataFrame.cumprod</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(</span>
<span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span>
<span class="sd"> ... columns=list(&#39;ABC&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C</span>
<span class="sd"> 0 1 NaN 4</span>
<span class="sd"> 1 1 0.1 3</span>
<span class="sd"> 2 1 20.0 2</span>
<span class="sd"> 3 4 10.0 1</span>
<span class="sd"> By default, iterates over rows and finds the sum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).cumprod().sort_index()</span>
<span class="sd"> B C</span>
<span class="sd"> 0 NaN 4</span>
<span class="sd"> 1 0.1 12</span>
<span class="sd"> 2 2.0 24</span>
<span class="sd"> 3 10.0 1</span>
<span class="sd"> It works as below in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).cumprod().sort_index()</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 0.1</span>
<span class="sd"> 2 2.0</span>
<span class="sd"> 3 10.0</span>
<span class="sd"> Name: B, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cumprod</span><span class="p">(</span><span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span>
<span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.cumsum"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cumsum.html#pyspark.pandas.groupby.GroupBy.cumsum">[docs]</a> <span class="k">def</span> <span class="nf">cumsum</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Cumulative sum for each group.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.cumsum</span>
<span class="sd"> DataFrame.cumsum</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(</span>
<span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span>
<span class="sd"> ... columns=list(&#39;ABC&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C</span>
<span class="sd"> 0 1 NaN 4</span>
<span class="sd"> 1 1 0.1 3</span>
<span class="sd"> 2 1 20.0 2</span>
<span class="sd"> 3 4 10.0 1</span>
<span class="sd"> By default, iterates over rows and finds the sum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).cumsum().sort_index()</span>
<span class="sd"> B C</span>
<span class="sd"> 0 NaN 4</span>
<span class="sd"> 1 0.1 7</span>
<span class="sd"> 2 20.1 9</span>
<span class="sd"> 3 10.0 1</span>
<span class="sd"> It works as below in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).cumsum().sort_index()</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 0.1</span>
<span class="sd"> 2 20.1</span>
<span class="sd"> 3 10.0</span>
<span class="sd"> Name: B, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cumsum</span><span class="p">(</span><span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span>
<span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.apply"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.apply.html#pyspark.pandas.groupby.GroupBy.apply">[docs]</a> <span class="k">def</span> <span class="nf">apply</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Apply function `func` group-wise and combine the results together.</span>
<span class="sd"> The function passed to `apply` must take a DataFrame as its first</span>
<span class="sd"> argument and return a DataFrame. `apply` will</span>
<span class="sd"> then take care of combining the results back together into a single</span>
<span class="sd"> dataframe. `apply` is therefore a highly flexible</span>
<span class="sd"> grouping method.</span>
<span class="sd"> While `apply` is a very flexible method, its downside is that</span>
<span class="sd"> using it can be quite a bit slower than using more specific methods</span>
<span class="sd"> like `agg` or `transform`. pandas-on-Spark offers a wide range of method that will</span>
<span class="sd"> be much faster than using `apply` for their specific purposes, so try to</span>
<span class="sd"> use them before reaching for `apply`.</span>
<span class="sd"> .. note:: this API executes the function once to infer the type which is</span>
<span class="sd"> potentially expensive, for instance, when the dataset is created after</span>
<span class="sd"> aggregations or sorting.</span>
<span class="sd"> To avoid this, specify return type in ``func``, for instance, as below:</span>
<span class="sd"> &gt;&gt;&gt; def pandas_div(x) -&gt; ps.DataFrame[int, [float, float]]:</span>
<span class="sd"> ... return x[[&#39;B&#39;, &#39;C&#39;]] / x[[&#39;B&#39;, &#39;C&#39;]]</span>
<span class="sd"> If the return type is specified, the output column names become</span>
<span class="sd"> `c0, c1, c2 ... cn`. These names are positionally mapped to the returned</span>
<span class="sd"> DataFrame in ``func``.</span>
<span class="sd"> To specify the column names, you can assign them in a NumPy compound type style</span>
<span class="sd"> as below:</span>
<span class="sd"> &gt;&gt;&gt; def pandas_div(x) -&gt; ps.DataFrame[(&quot;index&quot;, int), [(&quot;a&quot;, float), (&quot;b&quot;, float)]]:</span>
<span class="sd"> ... return x[[&#39;B&#39;, &#39;C&#39;]] / x[[&#39;B&#39;, &#39;C&#39;]]</span>
<span class="sd"> &gt;&gt;&gt; pdf = pd.DataFrame({&#39;B&#39;: [1.], &#39;C&#39;: [3.]})</span>
<span class="sd"> &gt;&gt;&gt; def plus_one(x) -&gt; ps.DataFrame[</span>
<span class="sd"> ... (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:</span>
<span class="sd"> ... return x[[&#39;B&#39;, &#39;C&#39;]] / x[[&#39;B&#39;, &#39;C&#39;]]</span>
<span class="sd"> .. note:: the dataframe within ``func`` is actually a pandas dataframe. Therefore,</span>
<span class="sd"> any pandas API within this function is allowed.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> func : callable</span>
<span class="sd"> A callable that takes a DataFrame as its first argument, and</span>
<span class="sd"> returns a dataframe.</span>
<span class="sd"> *args</span>
<span class="sd"> Positional arguments to pass to func.</span>
<span class="sd"> **kwargs</span>
<span class="sd"> Keyword arguments to pass to func.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> applied : DataFrame or Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> aggregate : Apply aggregate function to the GroupBy object.</span>
<span class="sd"> DataFrame.apply : Apply a function to a DataFrame.</span>
<span class="sd"> Series.apply : Apply a function to a Series.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: &#39;a a b&#39;.split(),</span>
<span class="sd"> ... &#39;B&#39;: [1, 2, 3],</span>
<span class="sd"> ... &#39;C&#39;: [4, 6, 5]}, columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;])</span>
<span class="sd"> &gt;&gt;&gt; g = df.groupby(&#39;A&#39;)</span>
<span class="sd"> Notice that ``g`` has two groups, ``a`` and ``b``.</span>
<span class="sd"> Calling `apply` in various ways, we can get different grouping results:</span>
<span class="sd"> Below the functions passed to `apply` takes a DataFrame as</span>
<span class="sd"> its argument and returns a DataFrame. `apply` combines the result for</span>
<span class="sd"> each group together into a new DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; def plus_min(x):</span>
<span class="sd"> ... return x + x.min()</span>
<span class="sd"> &gt;&gt;&gt; g.apply(plus_min).sort_index() # doctest: +SKIP</span>
<span class="sd"> A B C</span>
<span class="sd"> 0 aa 2 8</span>
<span class="sd"> 1 aa 3 10</span>
<span class="sd"> 2 bb 6 10</span>
<span class="sd"> &gt;&gt;&gt; g.apply(sum).sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> A B C</span>
<span class="sd"> A</span>
<span class="sd"> a aa 3 10</span>
<span class="sd"> b b 3 5</span>
<span class="sd"> &gt;&gt;&gt; g.apply(len).sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> A</span>
<span class="sd"> a 2</span>
<span class="sd"> b 1</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> You can specify the type hint and prevent schema inference for better performance.</span>
<span class="sd"> &gt;&gt;&gt; def pandas_div(x) -&gt; ps.DataFrame[int, [float, float]]:</span>
<span class="sd"> ... return x[[&#39;B&#39;, &#39;C&#39;]] / x[[&#39;B&#39;, &#39;C&#39;]]</span>
<span class="sd"> &gt;&gt;&gt; g.apply(pandas_div).sort_index() # doctest: +SKIP</span>
<span class="sd"> c0 c1</span>
<span class="sd"> 0 1.0 1.0</span>
<span class="sd"> 1 1.0 1.0</span>
<span class="sd"> 2 1.0 1.0</span>
<span class="sd"> &gt;&gt;&gt; def pandas_div(x) -&gt; ps.DataFrame[(&quot;index&quot;, int), [(&quot;f1&quot;, float), (&quot;f2&quot;, float)]]:</span>
<span class="sd"> ... return x[[&#39;B&#39;, &#39;C&#39;]] / x[[&#39;B&#39;, &#39;C&#39;]]</span>
<span class="sd"> &gt;&gt;&gt; g.apply(pandas_div).sort_index() # doctest: +SKIP</span>
<span class="sd"> f1 f2</span>
<span class="sd"> index</span>
<span class="sd"> 0 1.0 1.0</span>
<span class="sd"> 1 1.0 1.0</span>
<span class="sd"> 2 1.0 1.0</span>
<span class="sd"> In case of Series, it works as below.</span>
<span class="sd"> &gt;&gt;&gt; def plus_max(x) -&gt; ps.Series[int]:</span>
<span class="sd"> ... return x + x.max()</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).apply(plus_max).sort_index() # doctest: +SKIP</span>
<span class="sd"> 0 6</span>
<span class="sd"> 1 3</span>
<span class="sd"> 2 4</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; def plus_min(x):</span>
<span class="sd"> ... return x + x.min()</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).apply(plus_min).sort_index() # doctest: +SKIP</span>
<span class="sd"> 0 2</span>
<span class="sd"> 1 3</span>
<span class="sd"> 2 6</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> You can also return a scalar value as an aggregated value of the group:</span>
<span class="sd"> &gt;&gt;&gt; def plus_length(x) -&gt; int:</span>
<span class="sd"> ... return len(x)</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).apply(plus_length).sort_index() # doctest: +SKIP</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 2</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> The extra arguments to the function can be passed as below.</span>
<span class="sd"> &gt;&gt;&gt; def calculation(x, y, z) -&gt; int:</span>
<span class="sd"> ... return len(x) + y * z</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).apply(calculation, 5, z=10).sort_index() # doctest: +SKIP</span>
<span class="sd"> 0 51</span>
<span class="sd"> 1 52</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">callable</span><span class="p">(</span><span class="n">func</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">%s</span><span class="s2"> object is not callable&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="n">spec</span> <span class="o">=</span> <span class="n">inspect</span><span class="o">.</span><span class="n">getfullargspec</span><span class="p">(</span><span class="n">func</span><span class="p">)</span>
<span class="n">return_sig</span> <span class="o">=</span> <span class="n">spec</span><span class="o">.</span><span class="n">annotations</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;return&quot;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="n">should_infer_schema</span> <span class="o">=</span> <span class="n">return_sig</span> <span class="ow">is</span> <span class="kc">None</span>
<span class="n">should_retain_index</span> <span class="o">=</span> <span class="n">should_infer_schema</span>
<span class="n">is_series_groupby</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">SeriesGroupBy</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span>
<span class="p">]</span>
<span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">agg_columns</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
<span class="n">pandas_apply</span> <span class="o">=</span> <span class="n">_builtin_table</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">func</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">f</span> <span class="o">=</span> <span class="n">_builtin_table</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">func</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">pandas_apply</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="o">*</span><span class="n">a</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">k</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span>
<span class="k">return</span> <span class="n">f</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">),</span> <span class="o">*</span><span class="n">a</span><span class="p">,</span> <span class="o">**</span><span class="n">k</span><span class="p">)</span>
<span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">False</span>
<span class="k">if</span> <span class="n">should_infer_schema</span><span class="p">:</span>
<span class="c1"># Here we execute with the first 1000 to get the return type.</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;If the type hints is not specified for `groupby.apply`, &quot;</span>
<span class="s2">&quot;it is expensive to infer the data type internally.&quot;</span>
<span class="p">)</span>
<span class="n">limit</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">&quot;compute.shortcut_limit&quot;</span><span class="p">)</span>
<span class="c1"># Ensure sampling rows &gt;= 2 to make sure apply&#39;s infer schema is accurate</span>
<span class="c1"># See related: https://github.com/pandas-dev/pandas/issues/46893</span>
<span class="n">sample_limit</span> <span class="o">=</span> <span class="n">limit</span> <span class="o">+</span> <span class="mi">1</span> <span class="k">if</span> <span class="n">limit</span> <span class="k">else</span> <span class="mi">2</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">sample_limit</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span>
<span class="n">groupkeys</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">pdf</span><span class="p">[</span><span class="n">groupkey_name</span><span class="p">]</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">groupkey_name</span><span class="p">,</span> <span class="n">psser</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span>
<span class="p">]</span>
<span class="n">grouped</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">)</span>
<span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="n">pser_or_pdf</span> <span class="o">=</span> <span class="n">grouped</span><span class="p">[</span><span class="n">name</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">pser_or_pdf</span> <span class="o">=</span> <span class="n">grouped</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="n">psser_or_psdf</span> <span class="o">=</span> <span class="n">ps</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">pser_or_pdf</span><span class="o">.</span><span class="n">infer_objects</span><span class="p">())</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">limit</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser_or_psdf</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="n">psser_or_psdf</span> <span class="o">=</span> <span class="n">psser_or_psdf</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">SeriesGroupBy</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">],</span> <span class="n">psser_or_psdf</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">grouped</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">with</span> <span class="n">warnings</span><span class="o">.</span><span class="n">catch_warnings</span><span class="p">():</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">simplefilter</span><span class="p">(</span><span class="s2">&quot;always&quot;</span><span class="p">)</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;The amount of data for return type inference might not be large enough. &quot;</span>
<span class="s2">&quot;Consider increasing an option `compute.shortcut_limit`.&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser_or_psdf</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">psdf_from_pandas</span> <span class="o">=</span> <span class="n">psser_or_psdf</span><span class="o">.</span><span class="n">_psdf</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">psdf_from_pandas</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">psser_or_psdf</span><span class="p">)</span>
<span class="n">index_fields</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">field</span><span class="o">.</span><span class="n">normalize_spark_type</span><span class="p">()</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span>
<span class="p">]</span>
<span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">field</span><span class="o">.</span><span class="n">normalize_spark_type</span><span class="p">()</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span>
<span class="p">]</span>
<span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">index_fields</span> <span class="o">+</span> <span class="n">data_fields</span><span class="p">])</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">return_type</span> <span class="o">=</span> <span class="n">infer_return_type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_series_groupby</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">return_type</span><span class="p">,</span> <span class="n">SeriesType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Series as a return type hint at frame groupby is not supported &quot;</span>
<span class="s2">&quot;currently; however got [</span><span class="si">%s</span><span class="s2">]. Use DataFrame type hint instead.&quot;</span> <span class="o">%</span> <span class="n">return_sig</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">return_type</span><span class="p">,</span> <span class="n">DataFrameType</span><span class="p">):</span>
<span class="n">data_fields</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">data_fields</span>
<span class="n">return_schema</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">spark_type</span>
<span class="n">index_fields</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">index_fields</span>
<span class="n">should_retain_index</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">index_fields</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span>
<span class="n">psdf_from_pandas</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">dtype</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">SeriesType</span><span class="p">,</span> <span class="n">ScalarType</span><span class="p">],</span> <span class="n">return_type</span><span class="p">)</span><span class="o">.</span><span class="n">dtype</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">SeriesType</span><span class="p">,</span> <span class="n">ScalarType</span><span class="p">],</span> <span class="n">return_type</span><span class="p">)</span><span class="o">.</span><span class="n">spark_type</span>
<span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">InternalField</span><span class="p">(</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">struct_field</span><span class="o">=</span><span class="n">StructField</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span> <span class="n">dataType</span><span class="o">=</span><span class="n">spark_type</span><span class="p">)</span>
<span class="p">)</span>
<span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">InternalField</span><span class="p">(</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span>
<span class="n">struct_field</span><span class="o">=</span><span class="n">StructField</span><span class="p">(</span>
<span class="n">name</span><span class="o">=</span><span class="n">SPARK_DEFAULT_SERIES_NAME</span><span class="p">,</span> <span class="n">dataType</span><span class="o">=</span><span class="n">spark_type</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="p">]</span>
<span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">data_fields</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">pandas_groupby_apply</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="n">pdf_or_ser</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)[</span><span class="n">name</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">pdf_or_ser</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">if</span> <span class="n">should_return_series</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_ser</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">pdf_or_ser</span> <span class="o">=</span> <span class="n">pdf_or_ser</span><span class="o">.</span><span class="n">stack</span><span class="p">()</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_ser</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf_or_ser</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pdf_or_ser</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">pandas_groupby_apply</span><span class="p">,</span>
<span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span>
<span class="n">return_schema</span><span class="p">,</span>
<span class="n">retain_index</span><span class="o">=</span><span class="n">should_retain_index</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">should_retain_index</span><span class="p">:</span>
<span class="c1"># If schema is inferred, we can restore indexes too.</span>
<span class="k">if</span> <span class="n">psdf_from_pandas</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">index_spark_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_field</span><span class="o">.</span><span class="n">struct_field</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">index_field</span> <span class="ow">in</span> <span class="n">index_fields</span>
<span class="p">]</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">any</span><span class="p">(</span>
<span class="p">[</span>
<span class="n">SPARK_INDEX_NAME_PATTERN</span><span class="o">.</span><span class="n">match</span><span class="p">(</span><span class="n">index_field</span><span class="o">.</span><span class="n">struct_field</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">index_field</span> <span class="ow">in</span> <span class="n">index_fields</span>
<span class="p">]</span>
<span class="p">):</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="p">[(</span><span class="n">index_field</span><span class="o">.</span><span class="n">struct_field</span><span class="o">.</span><span class="n">name</span><span class="p">,)</span> <span class="k">for</span> <span class="n">index_field</span> <span class="ow">in</span> <span class="n">index_fields</span><span class="p">]</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span>
<span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span>
<span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># Otherwise, it loses index.</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">should_return_series</span><span class="p">:</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span>
<span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">SeriesGroupBy</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">return</span> <span class="n">psser</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div>
<span class="c1"># TODO: implement &#39;dropna&#39; parameter</span>
<div class="viewcode-block" id="GroupBy.filter"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.filter.html#pyspark.pandas.groupby.GroupBy.filter">[docs]</a> <span class="k">def</span> <span class="nf">filter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">FrameLike</span><span class="p">],</span> <span class="n">FrameLike</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return a copy of a DataFrame excluding elements from groups that</span>
<span class="sd"> do not satisfy the boolean criterion specified by func.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> f : function</span>
<span class="sd"> Function to apply to each subframe. Should return True or False.</span>
<span class="sd"> dropna : Drop groups that do not pass the filter. True by default;</span>
<span class="sd"> if False, groups that evaluate False are filled with NaNs.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> filtered : DataFrame or Series</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Each subframe is endowed the attribute &#39;name&#39; in case you need to know</span>
<span class="sd"> which group you are working on.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39; : [&#39;foo&#39;, &#39;bar&#39;, &#39;foo&#39;, &#39;bar&#39;,</span>
<span class="sd"> ... &#39;foo&#39;, &#39;bar&#39;],</span>
<span class="sd"> ... &#39;B&#39; : [1, 2, 3, 4, 5, 6],</span>
<span class="sd"> ... &#39;C&#39; : [2.0, 5., 8., 1., 2., 9.]}, columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;])</span>
<span class="sd"> &gt;&gt;&gt; grouped = df.groupby(&#39;A&#39;)</span>
<span class="sd"> &gt;&gt;&gt; grouped.filter(lambda x: x[&#39;B&#39;].mean() &gt; 3.)</span>
<span class="sd"> A B C</span>
<span class="sd"> 1 bar 2 5.0</span>
<span class="sd"> 3 bar 4 1.0</span>
<span class="sd"> 5 bar 6 9.0</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).filter(lambda x: x.mean() &gt; 3.)</span>
<span class="sd"> 1 2</span>
<span class="sd"> 3 4</span>
<span class="sd"> 5 6</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">callable</span><span class="p">(</span><span class="n">func</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">%s</span><span class="s2"> object is not callable&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="n">is_series_groupby</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">SeriesGroupBy</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span>
<span class="p">]</span>
<span class="n">data_schema</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">psdf</span><span class="p">[</span><span class="n">agg_columns</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span><span class="o">.</span><span class="n">schema</span>
<span class="p">)</span>
<span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">agg_columns</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">pandas_filter</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)[</span><span class="n">pdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]]</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">func</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">f</span> <span class="o">=</span> <span class="n">_builtin_table</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">func</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">wrapped_func</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="k">return</span> <span class="n">f</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">pandas_filter</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">wrapped_func</span><span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">pandas_filter</span><span class="p">,</span>
<span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span>
<span class="n">data_schema</span><span class="p">,</span>
<span class="n">retain_index</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">[</span><span class="n">agg_columns</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">))</span>
<span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">FrameLike</span><span class="p">,</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">FrameLike</span><span class="p">,</span> <span class="n">psdf</span><span class="p">)</span></div>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_prepare_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">groupkeys</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> <span class="n">agg_columns</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]:</span>
<span class="n">groupkey_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="s2">&quot;__groupkey_</span><span class="si">{}</span><span class="s2">__&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">))</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))</span>
<span class="p">]</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[[</span><span class="n">s</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">)]</span> <span class="o">+</span> <span class="n">agg_columns</span><span class="p">]</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span> <span class="k">else</span> <span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">]</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="p">),</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_spark_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span>
<span class="n">groupkeys_scols</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span>
<span class="n">return_schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">,</span>
<span class="n">retain_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SparkDataFrame</span><span class="p">:</span>
<span class="n">output_func</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_make_pandas_df_builder_func</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">func</span><span class="p">,</span> <span class="n">return_schema</span><span class="p">,</span> <span class="n">retain_index</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span>
<span class="k">return</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkeys_scols</span><span class="p">)</span><span class="o">.</span><span class="n">applyInPandas</span><span class="p">(</span><span class="n">output_func</span><span class="p">,</span> <span class="n">return_schema</span><span class="p">)</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_make_pandas_df_builder_func</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span>
<span class="n">return_schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">,</span>
<span class="n">retain_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Creates a function that can be used inside the pandas UDF. This function can construct</span>
<span class="sd"> the same pandas DataFrame as if the pandas-on-Spark DataFrame is collected to driver side.</span>
<span class="sd"> The index, column labels, etc. are re-constructed within the function.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.utils</span> <span class="kn">import</span> <span class="n">is_timestamp_ntz_preferred</span>
<span class="n">arguments_for_restore_index</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">arguments_for_restore_index</span>
<span class="n">prefer_timestamp_ntz</span> <span class="o">=</span> <span class="n">is_timestamp_ntz_preferred</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">rename_output</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">restore_index</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">copy</span><span class="p">(),</span> <span class="o">**</span><span class="n">arguments_for_restore_index</span><span class="p">)</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">func</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span>
<span class="c1"># If schema should be inferred, we don&#39;t restore the index. pandas seems to restore</span>
<span class="c1"># the index in some cases.</span>
<span class="c1"># When Spark output type is specified, without executing it, we don&#39;t know</span>
<span class="c1"># if we should restore the index or not. For instance, see the example in</span>
<span class="c1"># https://github.com/databricks/koalas/issues/628.</span>
<span class="n">pdf</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">prepare_pandas_frame</span><span class="p">(</span>
<span class="n">pdf</span><span class="p">,</span> <span class="n">retain_index</span><span class="o">=</span><span class="n">retain_index</span><span class="p">,</span> <span class="n">prefer_timestamp_ntz</span><span class="o">=</span><span class="n">prefer_timestamp_ntz</span>
<span class="p">)</span>
<span class="c1"># Just positionally map the column names to given schema&#39;s.</span>
<span class="n">pdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">return_schema</span><span class="o">.</span><span class="n">names</span>
<span class="k">return</span> <span class="n">pdf</span>
<span class="k">return</span> <span class="n">rename_output</span>
<div class="viewcode-block" id="GroupBy.rank"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.rank.html#pyspark.pandas.groupby.GroupBy.rank">[docs]</a> <span class="k">def</span> <span class="nf">rank</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;average&quot;</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Provide the rank of values within each group.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> method : {&#39;average&#39;, &#39;min&#39;, &#39;max&#39;, &#39;first&#39;, &#39;dense&#39;}, default &#39;average&#39;</span>
<span class="sd"> * average: average rank of group</span>
<span class="sd"> * min: lowest rank in group</span>
<span class="sd"> * max: highest rank in group</span>
<span class="sd"> * first: ranks assigned in order they appear in the array</span>
<span class="sd"> * dense: like &#39;min&#39;, but rank always increases by 1 between groups</span>
<span class="sd"> ascending : boolean, default True</span>
<span class="sd"> False for ranks by high (1) to low (N)</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame with ranking of values within each group</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({</span>
<span class="sd"> ... &#39;a&#39;: [1, 1, 1, 2, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;b&#39;: [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b</span>
<span class="sd"> 0 1 1</span>
<span class="sd"> 1 1 2</span>
<span class="sd"> 2 1 2</span>
<span class="sd"> 3 2 2</span>
<span class="sd"> 4 2 3</span>
<span class="sd"> 5 2 3</span>
<span class="sd"> 6 3 3</span>
<span class="sd"> 7 3 4</span>
<span class="sd"> 8 3 4</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;a&quot;).rank().sort_index()</span>
<span class="sd"> b</span>
<span class="sd"> 0 1.0</span>
<span class="sd"> 1 2.5</span>
<span class="sd"> 2 2.5</span>
<span class="sd"> 3 1.0</span>
<span class="sd"> 4 2.5</span>
<span class="sd"> 5 2.5</span>
<span class="sd"> 6 1.0</span>
<span class="sd"> 7 2.5</span>
<span class="sd"> 8 2.5</span>
<span class="sd"> &gt;&gt;&gt; df.b.groupby(df.a).rank(method=&#39;max&#39;).sort_index()</span>
<span class="sd"> 0 1.0</span>
<span class="sd"> 1 3.0</span>
<span class="sd"> 2 3.0</span>
<span class="sd"> 3 1.0</span>
<span class="sd"> 4 3.0</span>
<span class="sd"> 5 3.0</span>
<span class="sd"> 6 1.0</span>
<span class="sd"> 7 3.0</span>
<span class="sd"> 8 3.0</span>
<span class="sd"> Name: b, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_rank</span><span class="p">(</span><span class="n">method</span><span class="p">,</span> <span class="n">ascending</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span>
<span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.idxmax"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.idxmax.html#pyspark.pandas.groupby.GroupBy.idxmax">[docs]</a> <span class="k">def</span> <span class="nf">idxmax</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return index of first occurrence of maximum over requested axis in group.</span>
<span class="sd"> NA/null values are excluded.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> skipna : boolean, default True</span>
<span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.idxmax</span>
<span class="sd"> DataFrame.idxmax</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 2, 2, 3],</span>
<span class="sd"> ... &#39;b&#39;: [1, 2, 3, 4, 5],</span>
<span class="sd"> ... &#39;c&#39;: [5, 4, 3, 2, 1]}, columns=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;a&#39;])[&#39;b&#39;].idxmax().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> a</span>
<span class="sd"> 1 1</span>
<span class="sd"> 2 3</span>
<span class="sd"> 3 4</span>
<span class="sd"> Name: b, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;a&#39;]).idxmax().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> b c</span>
<span class="sd"> a</span>
<span class="sd"> 1 1 0</span>
<span class="sd"> 2 3 2</span>
<span class="sd"> 3 4 4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;idxmax only support one-level index now&quot;</span><span class="p">)</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;__groupkey_</span><span class="si">{}</span><span class="s2">__&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">):</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span>
<span class="n">index</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">stat_exprs</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_scols</span><span class="p">):</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">if</span> <span class="n">skipna</span><span class="p">:</span>
<span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">desc_nulls_last</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">desc_nulls_first</span><span class="p">()</span>
<span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span>
<span class="n">order_column</span><span class="p">,</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span>
<span class="n">name</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">,</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">stat_exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">))</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">stat_exprs</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">],</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="p">],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_handle_output</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<div class="viewcode-block" id="GroupBy.idxmin"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.idxmin.html#pyspark.pandas.groupby.GroupBy.idxmin">[docs]</a> <span class="k">def</span> <span class="nf">idxmin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return index of first occurrence of minimum over requested axis in group.</span>
<span class="sd"> NA/null values are excluded.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> skipna : boolean, default True</span>
<span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.idxmin</span>
<span class="sd"> DataFrame.idxmin</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 2, 2, 3],</span>
<span class="sd"> ... &#39;b&#39;: [1, 2, 3, 4, 5],</span>
<span class="sd"> ... &#39;c&#39;: [5, 4, 3, 2, 1]}, columns=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;a&#39;])[&#39;b&#39;].idxmin().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> a</span>
<span class="sd"> 1 0</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 4</span>
<span class="sd"> Name: b, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;a&#39;]).idxmin().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> b c</span>
<span class="sd"> a</span>
<span class="sd"> 1 0 1</span>
<span class="sd"> 2 2 3</span>
<span class="sd"> 3 4 4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;idxmin only support one-level index now&quot;</span><span class="p">)</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;__groupkey_</span><span class="si">{}</span><span class="s2">__&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">):</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span>
<span class="n">index</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">stat_exprs</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_scols</span><span class="p">):</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">if</span> <span class="n">skipna</span><span class="p">:</span>
<span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">asc_nulls_last</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">asc_nulls_first</span><span class="p">()</span>
<span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span>
<span class="n">order_column</span><span class="p">,</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span>
<span class="n">name</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">,</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">stat_exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">))</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">stat_exprs</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">],</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="p">],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_handle_output</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<div class="viewcode-block" id="GroupBy.fillna"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.fillna.html#pyspark.pandas.groupby.GroupBy.fillna">[docs]</a> <span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">method</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Fill NA/NaN values in group.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> value : scalar, dict, Series</span>
<span class="sd"> Value to use to fill holes. alternately a dict/Series of values</span>
<span class="sd"> specifying which value to use for each column.</span>
<span class="sd"> DataFrame is not supported.</span>
<span class="sd"> method : {&#39;backfill&#39;, &#39;bfill&#39;, &#39;pad&#39;, &#39;ffill&#39;, None}, default None</span>
<span class="sd"> Method to use for filling holes in reindexed Series pad / ffill: propagate last valid</span>
<span class="sd"> observation forward to next valid backfill / bfill:</span>
<span class="sd"> use NEXT valid observation to fill gap</span>
<span class="sd"> .. deprecated:: 4.0.0</span>
<span class="sd"> axis : {0 or `index`}</span>
<span class="sd"> 1 and `columns` are not supported.</span>
<span class="sd"> .. deprecated:: 4.0.0</span>
<span class="sd"> For axis=1, operate on the underlying object instead.</span>
<span class="sd"> Otherwise the axis keyword is not necessary.</span>
<span class="sd"> inplace : boolean, default False</span>
<span class="sd"> Fill in place (do not create a new object)</span>
<span class="sd"> limit : int, default None</span>
<span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span>
<span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span>
<span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span>
<span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span>
<span class="sd"> Must be greater than 0 if not None</span>
<span class="sd"> .. deprecated:: 4.0.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> DataFrame with NA entries filled.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({</span>
<span class="sd"> ... &#39;A&#39;: [1, 1, 2, 2],</span>
<span class="sd"> ... &#39;B&#39;: [2, 4, None, 3],</span>
<span class="sd"> ... &#39;C&#39;: [None, None, None, 1],</span>
<span class="sd"> ... &#39;D&#39;: [0, 1, 5, 4]</span>
<span class="sd"> ... },</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;, &#39;D&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 1 2.0 NaN 0</span>
<span class="sd"> 1 1 4.0 NaN 1</span>
<span class="sd"> 2 2 NaN NaN 5</span>
<span class="sd"> 3 2 3.0 1.0 4</span>
<span class="sd"> We can also propagate non-null values forward or backward in group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;A&#39;])[&#39;B&#39;].fillna(method=&#39;ffill&#39;).sort_index()</span>
<span class="sd"> 0 2.0</span>
<span class="sd"> 1 4.0</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> 3 3.0</span>
<span class="sd"> Name: B, dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;A&#39;]).fillna(method=&#39;bfill&#39;).sort_index()</span>
<span class="sd"> B C D</span>
<span class="sd"> 0 2.0 NaN 0</span>
<span class="sd"> 1 4.0 NaN 1</span>
<span class="sd"> 2 3.0 1.0 5</span>
<span class="sd"> 3 3.0 1.0 4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">should_resolve</span> <span class="o">=</span> <span class="n">method</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">should_resolve</span><span class="p">:</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;DataFrameGroupBy.fillna with &#39;method&#39; is deprecated &quot;</span>
<span class="s2">&quot;and will raise in a future version. &quot;</span>
<span class="s2">&quot;Use DataFrameGroupBy.ffill() or DataFrameGroupBy.bfill() instead.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_fillna</span><span class="p">(</span>
<span class="n">value</span><span class="o">=</span><span class="n">value</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="n">method</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span>
<span class="p">),</span>
<span class="n">should_resolve</span><span class="o">=</span><span class="n">should_resolve</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.bfill"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.bfill.html#pyspark.pandas.groupby.GroupBy.bfill">[docs]</a> <span class="k">def</span> <span class="nf">bfill</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Synonym for `DataFrame.fillna()` with ``method=`bfill```.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {0 or `index`}</span>
<span class="sd"> 1 and `columns` are not supported.</span>
<span class="sd"> inplace : boolean, default False</span>
<span class="sd"> Fill in place (do not create a new object)</span>
<span class="sd"> limit : int, default None</span>
<span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span>
<span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span>
<span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span>
<span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span>
<span class="sd"> Must be greater than 0 if not None</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> DataFrame with NA entries filled.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({</span>
<span class="sd"> ... &#39;A&#39;: [1, 1, 2, 2],</span>
<span class="sd"> ... &#39;B&#39;: [2, 4, None, 3],</span>
<span class="sd"> ... &#39;C&#39;: [None, None, None, 1],</span>
<span class="sd"> ... &#39;D&#39;: [0, 1, 5, 4]</span>
<span class="sd"> ... },</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;, &#39;D&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 1 2.0 NaN 0</span>
<span class="sd"> 1 1 4.0 NaN 1</span>
<span class="sd"> 2 2 NaN NaN 5</span>
<span class="sd"> 3 2 3.0 1.0 4</span>
<span class="sd"> Propagate non-null values backward.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;A&#39;]).bfill().sort_index()</span>
<span class="sd"> B C D</span>
<span class="sd"> 0 2.0 NaN 0</span>
<span class="sd"> 1 4.0 NaN 1</span>
<span class="sd"> 2 3.0 1.0 5</span>
<span class="sd"> 3 3.0 1.0 4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">&quot;bfill&quot;</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.ffill"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.ffill.html#pyspark.pandas.groupby.GroupBy.ffill">[docs]</a> <span class="k">def</span> <span class="nf">ffill</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Synonym for `DataFrame.fillna()` with ``method=`ffill```.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {0 or `index`}</span>
<span class="sd"> 1 and `columns` are not supported.</span>
<span class="sd"> inplace : boolean, default False</span>
<span class="sd"> Fill in place (do not create a new object)</span>
<span class="sd"> limit : int, default None</span>
<span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span>
<span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span>
<span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span>
<span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span>
<span class="sd"> Must be greater than 0 if not None</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> DataFrame with NA entries filled.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({</span>
<span class="sd"> ... &#39;A&#39;: [1, 1, 2, 2],</span>
<span class="sd"> ... &#39;B&#39;: [2, 4, None, 3],</span>
<span class="sd"> ... &#39;C&#39;: [None, None, None, 1],</span>
<span class="sd"> ... &#39;D&#39;: [0, 1, 5, 4]</span>
<span class="sd"> ... },</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;, &#39;D&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 1 2.0 NaN 0</span>
<span class="sd"> 1 1 4.0 NaN 1</span>
<span class="sd"> 2 2 NaN NaN 5</span>
<span class="sd"> 3 2 3.0 1.0 4</span>
<span class="sd"> Propagate non-null values forward.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;A&#39;]).ffill().sort_index()</span>
<span class="sd"> B C D</span>
<span class="sd"> 0 2.0 NaN 0</span>
<span class="sd"> 1 4.0 NaN 1</span>
<span class="sd"> 2 NaN NaN 5</span>
<span class="sd"> 3 3.0 1.0 4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">&quot;ffill&quot;</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_limit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">asc</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Private function for tail and head.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span>
<span class="p">]</span>
<span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span>
<span class="n">agg_columns</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_scols</span><span class="p">)</span>
<span class="c1"># This part is handled differently depending on whether it is a tail or a head.</span>
<span class="n">ordered_window</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">window</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span><span class="o">.</span><span class="n">asc</span><span class="p">())</span>
<span class="k">if</span> <span class="n">asc</span>
<span class="k">else</span> <span class="n">window</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span><span class="o">.</span><span class="n">desc</span><span class="p">())</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">n</span> <span class="o">&gt;=</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">tmp_row_num_col</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;__row_number__&quot;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">tmp_row_num_col</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">ordered_window</span><span class="p">))</span>
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">tmp_row_num_col</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">n</span><span class="p">)</span>
<span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">tmp_row_num_col</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># Pandas supports Groupby positional indexing since v1.4.0</span>
<span class="c1"># https://pandas.pydata.org/docs/whatsnew/v1.4.0.html#groupby-positional-indexing</span>
<span class="c1">#</span>
<span class="c1"># To support groupby positional indexing, we need add a `__tmp_lag__` column to help</span>
<span class="c1"># us filtering rows before the specified offset row.</span>
<span class="c1">#</span>
<span class="c1"># For example for the dataframe:</span>
<span class="c1"># &gt;&gt;&gt; df = ps.DataFrame([[&quot;g&quot;, &quot;g0&quot;],</span>
<span class="c1"># ... [&quot;g&quot;, &quot;g1&quot;],</span>
<span class="c1"># ... [&quot;g&quot;, &quot;g2&quot;],</span>
<span class="c1"># ... [&quot;g&quot;, &quot;g3&quot;],</span>
<span class="c1"># ... [&quot;h&quot;, &quot;h0&quot;],</span>
<span class="c1"># ... [&quot;h&quot;, &quot;h1&quot;]], columns=[&quot;A&quot;, &quot;B&quot;])</span>
<span class="c1"># &gt;&gt;&gt; df.groupby(&quot;A&quot;).head(-1)</span>
<span class="c1">#</span>
<span class="c1"># Below is a result to show the `__tmp_lag__` column for above df, the limit n is</span>
<span class="c1"># `-1`, the `__tmp_lag__` will be set to `0` in rows[:-1], and left will be set to</span>
<span class="c1"># `null`:</span>
<span class="c1">#</span>
<span class="c1"># &gt;&gt;&gt; sdf.withColumn(tmp_lag_col, F.lag(F.lit(0), -1).over(ordered_window))</span>
<span class="c1"># +-----------------+--------------+---+---+-----------------+-----------+</span>
<span class="c1"># |__index_level_0__|__groupkey_0__| A| B|__natural_order__|__tmp_lag__|</span>
<span class="c1"># +-----------------+--------------+---+---+-----------------+-----------+</span>
<span class="c1"># | 0| g| g| g0| 0| 0|</span>
<span class="c1"># | 1| g| g| g1| 8589934592| 0|</span>
<span class="c1"># | 2| g| g| g2| 17179869184| 0|</span>
<span class="c1"># | 3| g| g| g3| 25769803776| null|</span>
<span class="c1"># | 4| h| h| h0| 34359738368| 0|</span>
<span class="c1"># | 5| h| h| h1| 42949672960| null|</span>
<span class="c1"># +-----------------+--------------+---+---+-----------------+-----------+</span>
<span class="c1">#</span>
<span class="n">tmp_lag_col</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;__tmp_lag__&quot;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">tmp_lag_col</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="n">n</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">ordered_window</span><span class="p">))</span>
<span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="o">~</span><span class="n">F</span><span class="o">.</span><span class="n">isnull</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">tmp_lag_col</span><span class="p">)))</span>
<span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">tmp_lag_col</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_handle_output</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">))</span>
<div class="viewcode-block" id="GroupBy.head"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.head.html#pyspark.pandas.groupby.GroupBy.head">[docs]</a> <span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return first n rows of each group.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 1, 1, 2, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;b&#39;: [2, 3, 1, 4, 6, 9, 8, 10, 7, 5],</span>
<span class="sd"> ... &#39;c&#39;: [3, 5, 2, 5, 1, 2, 6, 4, 3, 6]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;],</span>
<span class="sd"> ... index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b c</span>
<span class="sd"> 7 1 2 3</span>
<span class="sd"> 2 1 3 5</span>
<span class="sd"> 4 1 1 2</span>
<span class="sd"> 1 1 4 5</span>
<span class="sd"> 3 2 6 1</span>
<span class="sd"> 4 2 9 2</span>
<span class="sd"> 9 2 8 6</span>
<span class="sd"> 10 3 10 4</span>
<span class="sd"> 5 3 7 3</span>
<span class="sd"> 6 3 5 6</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;a&#39;).head(2).sort_index()</span>
<span class="sd"> a b c</span>
<span class="sd"> 2 1 3 5</span>
<span class="sd"> 3 2 6 1</span>
<span class="sd"> 4 2 9 2</span>
<span class="sd"> 5 3 7 3</span>
<span class="sd"> 7 1 2 3</span>
<span class="sd"> 10 3 10 4</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;a&#39;)[&#39;b&#39;].head(2).sort_index()</span>
<span class="sd"> 2 3</span>
<span class="sd"> 3 6</span>
<span class="sd"> 4 9</span>
<span class="sd"> 5 7</span>
<span class="sd"> 7 2</span>
<span class="sd"> 10 10</span>
<span class="sd"> Name: b, dtype: int64</span>
<span class="sd"> Supports Groupby positional indexing Since pandas on Spark 3.4 (with pandas 1.4+):</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[&quot;g&quot;, &quot;g0&quot;],</span>
<span class="sd"> ... [&quot;g&quot;, &quot;g1&quot;],</span>
<span class="sd"> ... [&quot;g&quot;, &quot;g2&quot;],</span>
<span class="sd"> ... [&quot;g&quot;, &quot;g3&quot;],</span>
<span class="sd"> ... [&quot;h&quot;, &quot;h0&quot;],</span>
<span class="sd"> ... [&quot;h&quot;, &quot;h1&quot;]], columns=[&quot;A&quot;, &quot;B&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).head(-1) # doctest: +SKIP</span>
<span class="sd"> A B</span>
<span class="sd"> 0 g g0</span>
<span class="sd"> 1 g g1</span>
<span class="sd"> 2 g g2</span>
<span class="sd"> 4 h h0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_limit</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">asc</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.tail"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.tail.html#pyspark.pandas.groupby.GroupBy.tail">[docs]</a> <span class="k">def</span> <span class="nf">tail</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return last n rows of each group.</span>
<span class="sd"> Similar to `.apply(lambda x: x.tail(n))`, but it returns a subset of rows from</span>
<span class="sd"> the original DataFrame with original index and order preserved (`as_index` flag is ignored).</span>
<span class="sd"> Does not work for negative values of n.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 1, 1, 2, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;b&#39;: [2, 3, 1, 4, 6, 9, 8, 10, 7, 5],</span>
<span class="sd"> ... &#39;c&#39;: [3, 5, 2, 5, 1, 2, 6, 4, 3, 6]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;],</span>
<span class="sd"> ... index=[7, 2, 3, 1, 3, 4, 9, 10, 5, 6])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b c</span>
<span class="sd"> 7 1 2 3</span>
<span class="sd"> 2 1 3 5</span>
<span class="sd"> 3 1 1 2</span>
<span class="sd"> 1 1 4 5</span>
<span class="sd"> 3 2 6 1</span>
<span class="sd"> 4 2 9 2</span>
<span class="sd"> 9 2 8 6</span>
<span class="sd"> 10 3 10 4</span>
<span class="sd"> 5 3 7 3</span>
<span class="sd"> 6 3 5 6</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;a&#39;).tail(2).sort_index()</span>
<span class="sd"> a b c</span>
<span class="sd"> 1 1 4 5</span>
<span class="sd"> 3 1 1 2</span>
<span class="sd"> 4 2 9 2</span>
<span class="sd"> 5 3 7 3</span>
<span class="sd"> 6 3 5 6</span>
<span class="sd"> 9 2 8 6</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;a&#39;)[&#39;b&#39;].tail(2).sort_index()</span>
<span class="sd"> 1 4</span>
<span class="sd"> 3 1</span>
<span class="sd"> 4 9</span>
<span class="sd"> 5 7</span>
<span class="sd"> 6 5</span>
<span class="sd"> 9 8</span>
<span class="sd"> Name: b, dtype: int64</span>
<span class="sd"> Supports Groupby positional indexing Since pandas on Spark 3.4 (with pandas 1.4+):</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[&quot;g&quot;, &quot;g0&quot;],</span>
<span class="sd"> ... [&quot;g&quot;, &quot;g1&quot;],</span>
<span class="sd"> ... [&quot;g&quot;, &quot;g2&quot;],</span>
<span class="sd"> ... [&quot;g&quot;, &quot;g3&quot;],</span>
<span class="sd"> ... [&quot;h&quot;, &quot;h0&quot;],</span>
<span class="sd"> ... [&quot;h&quot;, &quot;h1&quot;]], columns=[&quot;A&quot;, &quot;B&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).tail(-1) # doctest: +SKIP</span>
<span class="sd"> A B</span>
<span class="sd"> 3 g g3</span>
<span class="sd"> 2 g g2</span>
<span class="sd"> 1 g g1</span>
<span class="sd"> 5 h h1</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_limit</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">asc</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.shift"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.shift.html#pyspark.pandas.groupby.GroupBy.shift">[docs]</a> <span class="k">def</span> <span class="nf">shift</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Shift each group by periods observations.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> periods : integer, default 1</span>
<span class="sd"> number of periods to shift</span>
<span class="sd"> fill_value : optional</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> Object shifted within each group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({</span>
<span class="sd"> ... &#39;a&#39;: [1, 1, 1, 2, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;b&#39;: [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b</span>
<span class="sd"> 0 1 1</span>
<span class="sd"> 1 1 2</span>
<span class="sd"> 2 1 2</span>
<span class="sd"> 3 2 2</span>
<span class="sd"> 4 2 3</span>
<span class="sd"> 5 2 3</span>
<span class="sd"> 6 3 3</span>
<span class="sd"> 7 3 4</span>
<span class="sd"> 8 3 4</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;a&#39;).shift().sort_index() # doctest: +SKIP</span>
<span class="sd"> b</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 1.0</span>
<span class="sd"> 2 2.0</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> 4 2.0</span>
<span class="sd"> 5 3.0</span>
<span class="sd"> 6 NaN</span>
<span class="sd"> 7 3.0</span>
<span class="sd"> 8 4.0</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;a&#39;).shift(periods=-1, fill_value=0).sort_index() # doctest: +SKIP</span>
<span class="sd"> b</span>
<span class="sd"> 0 2</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 0</span>
<span class="sd"> 3 3</span>
<span class="sd"> 4 3</span>
<span class="sd"> 5 0</span>
<span class="sd"> 6 4</span>
<span class="sd"> 7 4</span>
<span class="sd"> 8 0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_shift</span><span class="p">(</span><span class="n">periods</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span>
<span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.transform"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.transform.html#pyspark.pandas.groupby.GroupBy.transform">[docs]</a> <span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Apply function column-by-column to the GroupBy object.</span>
<span class="sd"> The function passed to `transform` must take a Series as its first</span>
<span class="sd"> argument and return a Series. The given function is executed for</span>
<span class="sd"> each series in each grouped data.</span>
<span class="sd"> While `transform` is a very flexible method, its downside is that</span>
<span class="sd"> using it can be quite a bit slower than using more specific methods</span>
<span class="sd"> like `agg` or `transform`. pandas-on-Spark offers a wide range of method that will</span>
<span class="sd"> be much faster than using `transform` for their specific purposes, so try to</span>
<span class="sd"> use them before reaching for `transform`.</span>
<span class="sd"> .. note:: this API executes the function once to infer the type which is</span>
<span class="sd"> potentially expensive, for instance, when the dataset is created after</span>
<span class="sd"> aggregations or sorting.</span>
<span class="sd"> To avoid this, specify return type in ``func``, for instance, as below:</span>
<span class="sd"> &gt;&gt;&gt; def convert_to_string(x) -&gt; ps.Series[str]:</span>
<span class="sd"> ... return x.apply(&quot;a string {}&quot;.format)</span>
<span class="sd"> When the given function has the return type annotated, the original index of the</span>
<span class="sd"> GroupBy object will be lost, and a default index will be attached to the result.</span>
<span class="sd"> Please be careful about configuring the default index. See also `Default Index Type</span>
<span class="sd"> &lt;https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type&gt;`_.</span>
<span class="sd"> .. note:: the series within ``func`` is actually a pandas series. Therefore,</span>
<span class="sd"> any pandas API within this function is allowed.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> func : callable</span>
<span class="sd"> A callable that takes a Series as its first argument, and</span>
<span class="sd"> returns a Series.</span>
<span class="sd"> *args</span>
<span class="sd"> Positional arguments to pass to func.</span>
<span class="sd"> **kwargs</span>
<span class="sd"> Keyword arguments to pass to func.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> applied : DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> aggregate : Apply aggregate function to the GroupBy object.</span>
<span class="sd"> Series.apply : Apply a function to a Series.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [0, 0, 1],</span>
<span class="sd"> ... &#39;B&#39;: [1, 2, 3],</span>
<span class="sd"> ... &#39;C&#39;: [4, 6, 5]}, columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;])</span>
<span class="sd"> &gt;&gt;&gt; g = df.groupby(&#39;A&#39;)</span>
<span class="sd"> Notice that ``g`` has two groups, ``0`` and ``1``.</span>
<span class="sd"> Calling `transform` in various ways, we can get different grouping results:</span>
<span class="sd"> Below the functions passed to `transform` takes a Series as</span>
<span class="sd"> its argument and returns a Series. `transform` applies the function on each series</span>
<span class="sd"> in each grouped data, and combine them into a new DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; def convert_to_string(x) -&gt; ps.Series[str]:</span>
<span class="sd"> ... return x.apply(&quot;a string {}&quot;.format)</span>
<span class="sd"> &gt;&gt;&gt; g.transform(convert_to_string) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C</span>
<span class="sd"> 0 a string 1 a string 4</span>
<span class="sd"> 1 a string 2 a string 6</span>
<span class="sd"> 2 a string 3 a string 5</span>
<span class="sd"> &gt;&gt;&gt; def plus_max(x) -&gt; ps.Series[int]:</span>
<span class="sd"> ... return x + x.max()</span>
<span class="sd"> &gt;&gt;&gt; g.transform(plus_max) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C</span>
<span class="sd"> 0 3 10</span>
<span class="sd"> 1 4 12</span>
<span class="sd"> 2 6 10</span>
<span class="sd"> You can omit the type hint and let pandas-on-Spark infer its type.</span>
<span class="sd"> &gt;&gt;&gt; def plus_min(x):</span>
<span class="sd"> ... return x + x.min()</span>
<span class="sd"> &gt;&gt;&gt; g.transform(plus_min) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C</span>
<span class="sd"> 0 2 8</span>
<span class="sd"> 1 3 10</span>
<span class="sd"> 2 6 10</span>
<span class="sd"> In case of Series, it works as below.</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).transform(plus_max)</span>
<span class="sd"> 0 3</span>
<span class="sd"> 1 4</span>
<span class="sd"> 2 6</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; (df * -1).B.groupby(df.A).transform(abs)</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 3</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> You can also specify extra arguments to pass to the function.</span>
<span class="sd"> &gt;&gt;&gt; def calculation(x, y, z) -&gt; ps.Series[int]:</span>
<span class="sd"> ... return x + x.min() + y + z</span>
<span class="sd"> &gt;&gt;&gt; g.transform(calculation, 5, z=20) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C</span>
<span class="sd"> 0 27 33</span>
<span class="sd"> 1 28 35</span>
<span class="sd"> 2 31 35</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">callable</span><span class="p">(</span><span class="n">func</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">%s</span><span class="s2"> object is not callable&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="n">spec</span> <span class="o">=</span> <span class="n">inspect</span><span class="o">.</span><span class="n">getfullargspec</span><span class="p">(</span><span class="n">func</span><span class="p">)</span>
<span class="n">return_sig</span> <span class="o">=</span> <span class="n">spec</span><span class="o">.</span><span class="n">annotations</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;return&quot;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">agg_columns</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">pandas_transform</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="n">should_infer_schema</span> <span class="o">=</span> <span class="n">return_sig</span> <span class="ow">is</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">should_infer_schema</span><span class="p">:</span>
<span class="c1"># Here we execute with the first 1000 to get the return type.</span>
<span class="c1"># If the records were less than 1000, it uses pandas API directly for a shortcut.</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;If the type hints is not specified for `groupby.transform`, &quot;</span>
<span class="s2">&quot;it is expensive to infer the data type internally.&quot;</span>
<span class="p">)</span>
<span class="n">limit</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">&quot;compute.shortcut_limit&quot;</span><span class="p">)</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">limit</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="n">psdf_from_pandas</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span>
<span class="n">return_schema</span> <span class="o">=</span> <span class="n">force_decimal_precision_scale</span><span class="p">(</span>
<span class="n">as_nullable_spark_type</span><span class="p">(</span>
<span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span><span class="o">.</span><span class="n">schema</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">limit</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_handle_output</span><span class="p">(</span><span class="n">psdf_from_pandas</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">pandas_transform</span><span class="p">,</span>
<span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span>
<span class="n">return_schema</span><span class="p">,</span>
<span class="n">retain_index</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># If schema is inferred, we can restore indexes too.</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span>
<span class="n">sdf</span><span class="p">,</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span>
<span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span>
<span class="p">],</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">return_type</span> <span class="o">=</span> <span class="n">infer_return_type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">return_type</span><span class="p">,</span> <span class="n">SeriesType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Expected the return type of this function to be of Series type, &quot;</span>
<span class="s2">&quot;but found type </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">return_type</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">dtype</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">dtype</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">spark_type</span>
<span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">InternalField</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">struct_field</span><span class="o">=</span><span class="n">StructField</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">c</span><span class="p">,</span> <span class="n">dataType</span><span class="o">=</span><span class="n">spark_type</span><span class="p">))</span>
<span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span>
<span class="k">if</span> <span class="n">c</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">groupkey_names</span>
<span class="p">]</span>
<span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">data_fields</span><span class="p">])</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">pandas_transform</span><span class="p">,</span>
<span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span>
<span class="n">return_schema</span><span class="p">,</span>
<span class="n">retain_index</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># Otherwise, it loses index.</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_handle_output</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<div class="viewcode-block" id="GroupBy.nunique"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.nunique.html#pyspark.pandas.groupby.GroupBy.nunique">[docs]</a> <span class="k">def</span> <span class="nf">nunique</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return DataFrame with number of distinct observations per group for each column.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> dropna : boolean, default True</span>
<span class="sd"> Don’t include NaN in the counts.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> nunique : DataFrame or Series</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;id&#39;: [&#39;spam&#39;, &#39;egg&#39;, &#39;egg&#39;, &#39;spam&#39;,</span>
<span class="sd"> ... &#39;ham&#39;, &#39;ham&#39;],</span>
<span class="sd"> ... &#39;value1&#39;: [1, 5, 5, 2, 5, 5],</span>
<span class="sd"> ... &#39;value2&#39;: list(&#39;abbaxy&#39;)}, columns=[&#39;id&#39;, &#39;value1&#39;, &#39;value2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> id value1 value2</span>
<span class="sd"> 0 spam 1 a</span>
<span class="sd"> 1 egg 5 b</span>
<span class="sd"> 2 egg 5 b</span>
<span class="sd"> 3 spam 2 a</span>
<span class="sd"> 4 ham 5 x</span>
<span class="sd"> 5 ham 5 y</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;id&#39;).nunique().sort_index() # doctest: +SKIP</span>
<span class="sd"> value1 value2</span>
<span class="sd"> id</span>
<span class="sd"> egg 1 1</span>
<span class="sd"> ham 1 2</span>
<span class="sd"> spam 2 1</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;id&#39;)[&#39;value1&#39;].nunique().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> id</span>
<span class="sd"> egg 1</span>
<span class="sd"> ham 1</span>
<span class="sd"> spam 2</span>
<span class="sd"> Name: value1, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">dropna</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">stat_function</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">countDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">stat_function</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">countDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="o">+</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">))</span> <span class="o">&gt;=</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span>
<span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">stat_function</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">rolling</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">window</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RollingGroupby[FrameLike]&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return an rolling grouper, providing rolling</span>
<span class="sd"> functionality per group.</span>
<span class="sd"> .. note:: &#39;min_periods&#39; in pandas-on-Spark works as a fixed window size unlike pandas.</span>
<span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span>
<span class="sd"> soon.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> window : int, or offset</span>
<span class="sd"> Size of the moving window.</span>
<span class="sd"> This is the number of observations used for calculating the statistic.</span>
<span class="sd"> Each window will be a fixed size.</span>
<span class="sd"> min_periods : int, default 1</span>
<span class="sd"> Minimum number of observations in window required to have a value</span>
<span class="sd"> (otherwise result is NA).</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.groupby</span>
<span class="sd"> DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">RollingGroupby</span>
<span class="k">return</span> <span class="n">RollingGroupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">window</span><span class="p">,</span> <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">expanding</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;ExpandingGroupby[FrameLike]&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return an expanding grouper, providing expanding</span>
<span class="sd"> functionality per group.</span>
<span class="sd"> .. note:: &#39;min_periods&#39; in pandas-on-Spark works as a fixed window size unlike pandas.</span>
<span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span>
<span class="sd"> soon.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> min_periods : int, default 1</span>
<span class="sd"> Minimum number of observations in window required to have a value</span>
<span class="sd"> (otherwise result is NA).</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.groupby</span>
<span class="sd"> DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">ExpandingGroupby</span>
<span class="k">return</span> <span class="n">ExpandingGroupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">)</span>
<span class="c1"># TODO: &#39;adjust&#39;, &#39;axis&#39;, &#39;method&#39; parameter should be implemented.</span>
<div class="viewcode-block" id="GroupBy.ewm"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.ewm.html#pyspark.pandas.groupby.GroupBy.ewm">[docs]</a> <span class="k">def</span> <span class="nf">ewm</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">com</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">span</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">halflife</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">alpha</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">min_periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">ignore_na</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;ExponentialMovingGroupby[FrameLike]&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return an ewm grouper, providing ewm functionality per group.</span>
<span class="sd"> .. note:: &#39;min_periods&#39; in pandas-on-Spark works as a fixed window size unlike pandas.</span>
<span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span>
<span class="sd"> soon.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> com : float, optional</span>
<span class="sd"> Specify decay in terms of center of mass.</span>
<span class="sd"> alpha = 1 / (1 + com), for com &gt;= 0.</span>
<span class="sd"> span : float, optional</span>
<span class="sd"> Specify decay in terms of span.</span>
<span class="sd"> alpha = 2 / (span + 1), for span &gt;= 1.</span>
<span class="sd"> halflife : float, optional</span>
<span class="sd"> Specify decay in terms of half-life.</span>
<span class="sd"> alpha = 1 - exp(-ln(2) / halflife), for halflife &gt; 0.</span>
<span class="sd"> alpha : float, optional</span>
<span class="sd"> Specify smoothing factor alpha directly.</span>
<span class="sd"> 0 &lt; alpha &lt;= 1.</span>
<span class="sd"> min_periods : int, default None</span>
<span class="sd"> Minimum number of observations in window required to have a value</span>
<span class="sd"> (otherwise result is NA).</span>
<span class="sd"> ignore_na : bool, default False</span>
<span class="sd"> Ignore missing values when calculating weights.</span>
<span class="sd"> - When ``ignore_na=False`` (default), weights are based on absolute positions.</span>
<span class="sd"> For example, the weights of :math:`x_0` and :math:`x_2` used in calculating</span>
<span class="sd"> the final weighted average of [:math:`x_0`, None, :math:`x_2`] are</span>
<span class="sd"> :math:`(1-\alpha)^2` and :math:`1` if ``adjust=True``, and</span>
<span class="sd"> :math:`(1-\alpha)^2` and :math:`\alpha` if ``adjust=False``.</span>
<span class="sd"> - When ``ignore_na=True``, weights are based</span>
<span class="sd"> on relative positions. For example, the weights of :math:`x_0` and :math:`x_2`</span>
<span class="sd"> used in calculating the final weighted average of</span>
<span class="sd"> [:math:`x_0`, None, :math:`x_2`] are :math:`1-\alpha` and :math:`1` if</span>
<span class="sd"> ``adjust=True``, and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">ExponentialMovingGroupby</span>
<span class="k">return</span> <span class="n">ExponentialMovingGroupby</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">com</span><span class="o">=</span><span class="n">com</span><span class="p">,</span>
<span class="n">span</span><span class="o">=</span><span class="n">span</span><span class="p">,</span>
<span class="n">halflife</span><span class="o">=</span><span class="n">halflife</span><span class="p">,</span>
<span class="n">alpha</span><span class="o">=</span><span class="n">alpha</span><span class="p">,</span>
<span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">,</span>
<span class="n">ignore_na</span><span class="o">=</span><span class="n">ignore_na</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.get_group"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.get_group.html#pyspark.pandas.groupby.GroupBy.get_group">[docs]</a> <span class="k">def</span> <span class="nf">get_group</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Construct DataFrame from group with provided name.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : object</span>
<span class="sd"> The name of the group to get as a DataFrame.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> group : same type as obj</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame([(&#39;falcon&#39;, &#39;bird&#39;, 389.0),</span>
<span class="sd"> ... (&#39;parrot&#39;, &#39;bird&#39;, 24.0),</span>
<span class="sd"> ... (&#39;lion&#39;, &#39;mammal&#39;, 80.5),</span>
<span class="sd"> ... (&#39;monkey&#39;, &#39;mammal&#39;, np.nan)],</span>
<span class="sd"> ... columns=[&#39;name&#39;, &#39;class&#39;, &#39;max_speed&#39;],</span>
<span class="sd"> ... index=[0, 2, 3, 1])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> name class max_speed</span>
<span class="sd"> 0 falcon bird 389.0</span>
<span class="sd"> 2 parrot bird 24.0</span>
<span class="sd"> 3 lion mammal 80.5</span>
<span class="sd"> 1 monkey mammal NaN</span>
<span class="sd"> &gt;&gt;&gt; psdf.groupby(&quot;class&quot;).get_group(&quot;bird&quot;).sort_index()</span>
<span class="sd"> name class max_speed</span>
<span class="sd"> 0 falcon bird 389.0</span>
<span class="sd"> 2 parrot bird 24.0</span>
<span class="sd"> &gt;&gt;&gt; psdf.groupby(&quot;class&quot;).get_group(&quot;mammal&quot;).sort_index()</span>
<span class="sd"> name class max_speed</span>
<span class="sd"> 1 monkey mammal NaN</span>
<span class="sd"> 3 lion mammal 80.5</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">groupkeys</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_hashable</span><span class="p">(</span><span class="n">name</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;unhashable type: &#39;</span><span class="si">{}</span><span class="s2">&#39;&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">name</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span>
<span class="k">elif</span> <span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;must supply a tuple to get_group with multiple grouping keys&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">name</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;must supply a same-length tuple to get_group with multiple grouping keys&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">name</span><span class="p">):</span>
<span class="n">name</span> <span class="o">=</span> <span class="p">[</span><span class="n">name</span><span class="p">]</span>
<span class="n">cond</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)</span>
<span class="k">for</span> <span class="n">groupkey</span><span class="p">,</span> <span class="n">item</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">groupkey</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="n">cond</span> <span class="o">=</span> <span class="n">cond</span> <span class="o">&amp;</span> <span class="p">(</span><span class="n">scol</span> <span class="o">==</span> <span class="n">item</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span>
<span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span>
<span class="n">spark_frame</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">internal</span><span class="o">.</span><span class="n">index_spark_columns</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_scols</span>
<span class="p">)</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">spark_frame</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">spark_frame</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">spark_frame</span><span class="p">,</span> <span class="n">s</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span>
<span class="k">if</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">head</span><span class="p">()</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">name</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_handle_output</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<div class="viewcode-block" id="GroupBy.median"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.median.html#pyspark.pandas.groupby.GroupBy.median">[docs]</a> <span class="k">def</span> <span class="nf">median</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10000</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute median of groups, excluding missing values.</span>
<span class="sd"> For multiple groupings, the result index will be a MultiIndex</span>
<span class="sd"> .. note:: Unlike pandas&#39;, the median in pandas-on-Spark is an approximated median based upon</span>
<span class="sd"> approximate percentile computation because computing median across a large dataset</span>
<span class="sd"> is extremely expensive.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> numeric_only : bool, default False</span>
<span class="sd"> Include only float, int, boolean columns.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> Median of values within each group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({&#39;a&#39;: [1., 1., 1., 1., 2., 2., 2., 3., 3., 3.],</span>
<span class="sd"> ... &#39;b&#39;: [2., 3., 1., 4., 6., 9., 8., 10., 7., 5.],</span>
<span class="sd"> ... &#39;c&#39;: [3., 5., 2., 5., 1., 2., 6., 4., 3., 6.]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;],</span>
<span class="sd"> ... index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> a b c</span>
<span class="sd"> 7 1.0 2.0 3.0</span>
<span class="sd"> 2 1.0 3.0 5.0</span>
<span class="sd"> 4 1.0 1.0 2.0</span>
<span class="sd"> 1 1.0 4.0 5.0</span>
<span class="sd"> 3 2.0 6.0 1.0</span>
<span class="sd"> 4 2.0 9.0 2.0</span>
<span class="sd"> 9 2.0 8.0 6.0</span>
<span class="sd"> 10 3.0 10.0 4.0</span>
<span class="sd"> 5 3.0 7.0 3.0</span>
<span class="sd"> 6 3.0 5.0 6.0</span>
<span class="sd"> DataFrameGroupBy</span>
<span class="sd"> &gt;&gt;&gt; psdf.groupby(&#39;a&#39;).median().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> b c</span>
<span class="sd"> a</span>
<span class="sd"> 1.0 2.0 3.0</span>
<span class="sd"> 2.0 8.0 2.0</span>
<span class="sd"> 3.0 7.0 4.0</span>
<span class="sd"> SeriesGroupBy</span>
<span class="sd"> &gt;&gt;&gt; psdf.groupby(&#39;a&#39;)[&#39;b&#39;].median().sort_index()</span>
<span class="sd"> a</span>
<span class="sd"> 1.0 2.0</span>
<span class="sd"> 2.0 8.0</span>
<span class="sd"> 3.0 7.0</span>
<span class="sd"> Name: b, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">accuracy</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;accuracy must be an integer; however, got [</span><span class="si">%s</span><span class="s2">]&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_validate_agg_columns</span><span class="p">(</span><span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">function_name</span><span class="o">=</span><span class="s2">&quot;median&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">stat_function</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">percentile_approx</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">stat_function</span><span class="p">,</span>
<span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,),</span>
<span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_validate_agg_columns</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">],</span> <span class="n">function_name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Validate aggregation columns and raise an error or a warning following pandas.&quot;&quot;&quot;</span>
<span class="n">has_non_numeric</span> <span class="o">=</span> <span class="kc">False</span>
<span class="k">for</span> <span class="n">_agg_col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">:</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">_agg_col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)):</span>
<span class="n">has_non_numeric</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">break</span>
<span class="k">if</span> <span class="n">has_non_numeric</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">SeriesGroupBy</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;Only numeric aggregation column is accepted.&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">numeric_only</span> <span class="ow">and</span> <span class="n">has_non_numeric</span><span class="p">:</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;Dropping invalid columns in DataFrameGroupBy.</span><span class="si">%s</span><span class="s2"> is deprecated. &quot;</span>
<span class="s2">&quot;In a future version, a TypeError will be raised. &quot;</span>
<span class="s2">&quot;Before calling .</span><span class="si">%s</span><span class="s2">, select only columns which should be &quot;</span>
<span class="s2">&quot;valid for the function.&quot;</span> <span class="o">%</span> <span class="p">(</span><span class="n">function_name</span><span class="p">,</span> <span class="n">function_name</span><span class="p">),</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">_reduce_for_stat_function</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">sfun</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span>
<span class="n">accepted_spark_types</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Type</span><span class="p">[</span><span class="n">DataType</span><span class="p">],</span> <span class="o">...</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">bool_to_numeric</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Apply an aggregate function `sfun` per column and reduce to a FrameLike.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> sfun : The aggregate function to apply per column.</span>
<span class="sd"> accepted_spark_types: Accepted spark types of columns to be aggregated;</span>
<span class="sd"> default None means all spark types are accepted.</span>
<span class="sd"> bool_to_numeric: If True, boolean columns are converted to numeric columns, which</span>
<span class="sd"> are accepted for all statistical functions regardless of</span>
<span class="sd"> `accepted_spark_types`.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span>
<span class="n">internal</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_reduce</span><span class="p">(</span>
<span class="n">groupkey_names</span><span class="p">,</span> <span class="n">accepted_spark_types</span><span class="p">,</span> <span class="n">bool_to_numeric</span>
<span class="p">)</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">min_count</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;min_count&quot;</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span>
<span class="n">stat_exprs</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="n">input_scol</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">nan_to_null</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="k">if</span> <span class="n">sfun</span><span class="o">.</span><span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;sum&quot;</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> <span class="n">StringType</span>
<span class="p">):</span>
<span class="n">input_scol_name</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="c1"># Sort data with natural order column to ensure order of data</span>
<span class="n">sorted_array</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">array_sort</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">collect_list</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> <span class="n">input_scol</span><span class="p">))</span>
<span class="p">)</span>
<span class="c1"># Using transform to extract strings</span>
<span class="n">output_scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">concat_ws</span><span class="p">(</span>
<span class="s2">&quot;&quot;</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">sorted_array</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">getField</span><span class="p">(</span><span class="n">input_scol_name</span><span class="p">))</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">output_scol</span> <span class="o">=</span> <span class="n">sfun</span><span class="p">(</span><span class="n">input_scol</span><span class="p">)</span>
<span class="k">if</span> <span class="n">min_count</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">output_scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="o">~</span><span class="n">F</span><span class="o">.</span><span class="n">isnull</span><span class="p">(</span><span class="n">input_scol</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)))</span> <span class="o">&gt;=</span> <span class="n">min_count</span><span class="p">,</span> <span class="n">output_scol</span>
<span class="p">)</span>
<span class="n">stat_exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">output_scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]))</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">stat_exprs</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_return</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_prepare_return</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">agg_column_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">:</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span>
<span class="n">subset</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">:</span>
<span class="n">column_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">column</span><span class="o">.</span><span class="n">name</span> <span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">]</span>
<span class="k">for</span> <span class="n">groupkey</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">:</span>
<span class="k">if</span> <span class="n">groupkey</span><span class="o">.</span><span class="n">name</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_names</span><span class="p">:</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;A grouping was used that is not in the columns of the DataFrame and so &quot;</span>
<span class="s2">&quot;was excluded from the result. &quot;</span>
<span class="s2">&quot;This grouping will be included in a future version. &quot;</span>
<span class="s2">&quot;Add the grouping as a column of the DataFrame to silence this warning.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">should_drop_index</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span>
<span class="n">i</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">gkey</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> <span class="k">if</span> <span class="n">gkey</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">should_drop_index</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">level</span><span class="o">=</span><span class="n">should_drop_index</span><span class="p">,</span> <span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">should_drop_index</span><span class="p">)</span> <span class="o">&lt;</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">):</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_handle_output</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">agg_column_names</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_prepare_reduce</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">groupkey_names</span><span class="p">:</span> <span class="n">List</span><span class="p">,</span>
<span class="n">accepted_spark_types</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Type</span><span class="p">[</span><span class="n">DataType</span><span class="p">],</span> <span class="o">...</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">bool_to_numeric</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">InternalFrame</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> <span class="n">SparkDataFrame</span><span class="p">]:</span>
<span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">:</span>
<span class="k">if</span> <span class="n">bool_to_numeric</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span>
<span class="n">agg_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">int</span><span class="p">))</span>
<span class="k">elif</span> <span class="p">(</span><span class="n">accepted_spark_types</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span>
<span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">accepted_spark_types</span>
<span class="p">):</span>
<span class="n">agg_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="o">*</span><span class="n">groupkey_scols</span><span class="p">,</span>
<span class="o">*</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span>
<span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">],</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span>
<span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span>
<span class="n">column_label_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">internal</span><span class="p">,</span> <span class="n">agg_columns</span><span class="p">,</span> <span class="n">sdf</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_resolve_grouping_from_diff_dataframes</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> <span class="n">Set</span><span class="p">[</span><span class="n">Label</span><span class="p">]]:</span>
<span class="n">column_labels_level</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">additional_pssers</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">additional_column_labels</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">tmp_column_labels</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">by</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">if</span> <span class="n">col_or_s</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="n">psdf</span><span class="p">:</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="o">.</span><span class="n">_column_label</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">psdf</span><span class="p">):</span>
<span class="n">temp_label</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="s2">&quot;__tmp_groupkey_</span><span class="si">{}</span><span class="s2">__&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">))</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span>
<span class="n">additional_pssers</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">temp_label</span><span class="p">))</span>
<span class="n">additional_column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">temp_label</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="nb">tuple</span><span class="p">(</span>
<span class="p">([</span><span class="s2">&quot;&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="p">(</span><span class="n">column_labels_level</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span> <span class="o">+</span> <span class="p">[</span><span class="s2">&quot;__tmp_groupkey_</span><span class="si">{}</span><span class="s2">__&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">)]</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span>
<span class="n">tmp_column_labels</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">col_or_s</span><span class="p">]</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">))</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span>
<span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">]</span>
<span class="o">+</span> <span class="n">additional_pssers</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">assign_columns</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">this_column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> <span class="n">that_column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]]:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span>
<span class="s2">&quot;Duplicated labels with groupby() and &quot;</span>
<span class="s2">&quot;&#39;compute.ops_on_diff_frames&#39; option is not supported currently &quot;</span>
<span class="s2">&quot;Please use unique labels in series and frames.&quot;</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">col_or_s</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">):</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">tmp_column_labels</span><span class="p">:</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">col_or_s</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">align_diff_frames</span><span class="p">(</span>
<span class="n">assign_columns</span><span class="p">,</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">psser</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">label</span><span class="p">),</span>
<span class="n">fillna</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">how</span><span class="o">=</span><span class="s2">&quot;inner&quot;</span><span class="p">,</span>
<span class="n">preserve_order_column</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">tmp_column_labels</span> <span class="o">|=</span> <span class="nb">set</span><span class="p">(</span><span class="n">additional_column_labels</span><span class="p">)</span>
<span class="n">new_by_series</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">col_or_s</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">):</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">tmp_column_labels</span><span class="p">:</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">col_or_s</span>
<span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">name</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span>
<span class="k">return</span> <span class="n">psdf</span><span class="p">,</span> <span class="n">new_by_series</span><span class="p">,</span> <span class="n">tmp_column_labels</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_resolve_grouping</span><span class="p">(</span><span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">]:</span>
<span class="n">new_by_series</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="n">by</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">col_or_s</span><span class="p">]</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">))</span>
<span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span>
<span class="k">return</span> <span class="n">new_by_series</span>
<span class="k">class</span> <span class="nc">DataFrameGroupBy</span><span class="p">(</span><span class="n">GroupBy</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">]):</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_build</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrameGroupBy&quot;</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">col_or_s</span><span class="p">)</span> <span class="k">for</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="n">by</span><span class="p">):</span>
<span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">new_by_series</span><span class="p">,</span>
<span class="n">column_labels_to_exclude</span><span class="p">,</span>
<span class="p">)</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping_from_diff_dataframes</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">by</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">new_by_series</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">by</span><span class="p">)</span>
<span class="n">column_labels_to_exclude</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="k">return</span> <span class="n">DataFrameGroupBy</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">new_by_series</span><span class="p">,</span>
<span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span>
<span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span>
<span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="n">column_labels_to_exclude</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span>
<span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
<span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
<span class="n">column_labels_to_exclude</span><span class="p">:</span> <span class="n">Set</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span>
<span class="n">agg_columns</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="n">agg_columns_selected</span> <span class="o">=</span> <span class="n">agg_columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">agg_columns_selected</span><span class="p">:</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">:</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels_to_exclude</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">label</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">any</span><span class="p">(</span><span class="n">label</span> <span class="o">==</span> <span class="n">key</span><span class="o">.</span><span class="n">_column_label</span> <span class="ow">and</span> <span class="n">key</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="n">psdf</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">by</span><span class="p">)</span>
<span class="ow">and</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_labels_to_exclude</span>
<span class="p">]</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">=</span><span class="n">psdf</span><span class="p">,</span>
<span class="n">groupkeys</span><span class="o">=</span><span class="n">by</span><span class="p">,</span>
<span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span>
<span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span>
<span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="n">column_labels_to_exclude</span><span class="p">,</span>
<span class="n">agg_columns_selected</span><span class="o">=</span><span class="n">agg_columns_selected</span><span class="p">,</span>
<span class="n">agg_columns</span><span class="o">=</span><span class="p">[</span><span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">MissingPandasLikeDataFrameGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">):</span>
<span class="n">property_or_func</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">MissingPandasLikeDataFrameGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="nb">property</span><span class="p">):</span>
<span class="k">return</span> <span class="n">property_or_func</span><span class="o">.</span><span class="n">fget</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">partial</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="fm">__getitem__</span><span class="p">(</span><span class="n">item</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">GroupBy</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span> <span class="ow">and</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">item</span><span class="p">):</span>
<span class="k">return</span> <span class="n">SeriesGroupBy</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">item</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">item</span><span class="p">,)),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span>
<span class="n">dropna</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">item</span><span class="p">):</span>
<span class="n">item</span> <span class="o">=</span> <span class="p">[</span><span class="n">item</span><span class="p">]</span>
<span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">item</span><span class="p">):</span>
<span class="n">item</span> <span class="o">=</span> <span class="p">[(</span><span class="n">item</span><span class="p">,)]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">item</span> <span class="o">=</span> <span class="p">[</span><span class="n">i</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">i</span><span class="p">,)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">item</span><span class="p">]</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">:</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">key</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span>
<span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">item</span><span class="p">:</span>
<span class="k">if</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;cannot insert </span><span class="si">{}</span><span class="s2">, already exists&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">name</span><span class="p">))</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrameGroupBy</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span>
<span class="n">as_index</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">,</span>
<span class="n">dropna</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">,</span>
<span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span><span class="p">,</span>
<span class="n">agg_columns</span><span class="o">=</span><span class="n">item</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">&quot;SeriesGroupBy&quot;</span><span class="p">],</span> <span class="n">Series</span><span class="p">],</span>
<span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="n">applied</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">:</span>
<span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">(</span><span class="n">column</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span>
<span class="k">if</span> <span class="n">numeric_only</span><span class="p">:</span>
<span class="n">applied</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">applied</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)]</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">applied</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">DataError</span><span class="p">(</span><span class="s2">&quot;No numeric types to aggregate&quot;</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span><span class="n">applied</span><span class="p">,</span> <span class="n">keep_order</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="k">if</span> <span class="n">should_resolve</span><span class="p">:</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">resolved_copy</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_handle_output</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">agg_column_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="k">if</span> <span class="n">agg_column_names</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">psdf</span><span class="p">[</span><span class="n">agg_column_names</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">psdf</span>
<span class="c1"># TODO: Implement &#39;percentiles&#39;, &#39;include&#39;, and &#39;exclude&#39; arguments.</span>
<span class="c1"># TODO: Add ``DataFrame.select_dtypes`` to See Also when &#39;include&#39;</span>
<span class="c1"># and &#39;exclude&#39; arguments are implemented.</span>
<div class="viewcode-block" id="DataFrameGroupBy.describe"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.DataFrameGroupBy.describe.html#pyspark.pandas.groupby.DataFrameGroupBy.describe">[docs]</a> <span class="k">def</span> <span class="nf">describe</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Generate descriptive statistics that summarize the central tendency,</span>
<span class="sd"> dispersion and shape of a dataset&#39;s distribution, excluding</span>
<span class="sd"> ``NaN`` values.</span>
<span class="sd"> Analyzes both numeric and object series, as well</span>
<span class="sd"> as ``DataFrame`` column sets of mixed data types. The output</span>
<span class="sd"> will vary depending on what is provided. Refer to the notes</span>
<span class="sd"> below for more detail.</span>
<span class="sd"> .. note:: Unlike pandas, the percentiles in pandas-on-Spark are based upon</span>
<span class="sd"> approximate percentile computation because computing percentiles</span>
<span class="sd"> across a large dataset is extremely expensive.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> Summary statistics of the DataFrame provided.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.count</span>
<span class="sd"> DataFrame.max</span>
<span class="sd"> DataFrame.min</span>
<span class="sd"> DataFrame.mean</span>
<span class="sd"> DataFrame.std</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 3], &#39;b&#39;: [4, 5, 6], &#39;c&#39;: [7, 8, 9]})</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b c</span>
<span class="sd"> 0 1 4 7</span>
<span class="sd"> 1 1 5 8</span>
<span class="sd"> 2 3 6 9</span>
<span class="sd"> Describing a ``DataFrame``. By default only numeric fields</span>
<span class="sd"> are returned.</span>
<span class="sd"> &gt;&gt;&gt; described = df.groupby(&#39;a&#39;).describe()</span>
<span class="sd"> &gt;&gt;&gt; described.sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> b c</span>
<span class="sd"> count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max</span>
<span class="sd"> a</span>
<span class="sd"> 1 2.0 4.5 0.707107 4.0 4.0 4.0 5.0 5.0 2.0 7.5 0.707107 7.0 7.0 7.0 8.0 8.0</span>
<span class="sd"> 3 1.0 6.0 NaN 6.0 6.0 6.0 6.0 6.0 1.0 9.0 NaN 9.0 9.0 9.0 9.0 9.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">StringType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span>
<span class="s2">&quot;DataFrameGroupBy.describe() doesn&#39;t support for string type for now&quot;</span>
<span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">aggregate</span><span class="p">([</span><span class="s2">&quot;count&quot;</span><span class="p">,</span> <span class="s2">&quot;mean&quot;</span><span class="p">,</span> <span class="s2">&quot;std&quot;</span><span class="p">,</span> <span class="s2">&quot;min&quot;</span><span class="p">,</span> <span class="s2">&quot;quartiles&quot;</span><span class="p">,</span> <span class="s2">&quot;max&quot;</span><span class="p">])</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="n">agg_column_labels</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">]</span>
<span class="n">formatted_percentiles</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;25%&quot;</span><span class="p">,</span> <span class="s2">&quot;50%&quot;</span><span class="p">,</span> <span class="s2">&quot;75%&quot;</span><span class="p">]</span>
<span class="c1"># Split &quot;quartiles&quot; columns into first, second, and third quartiles.</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">agg_column_labels</span><span class="p">:</span>
<span class="n">quartiles_col</span> <span class="o">=</span> <span class="n">name_like_string</span><span class="p">(</span><span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="s2">&quot;quartiles&quot;</span><span class="p">]))</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">percentile</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">formatted_percentiles</span><span class="p">):</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span>
<span class="n">name_like_string</span><span class="p">(</span><span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="n">percentile</span><span class="p">])),</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">quartiles_col</span><span class="p">)[</span><span class="n">i</span><span class="p">],</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">quartiles_col</span><span class="p">)</span>
<span class="c1"># Reorder columns lexicographically by agg column followed by stats.</span>
<span class="n">stats</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;count&quot;</span><span class="p">,</span> <span class="s2">&quot;mean&quot;</span><span class="p">,</span> <span class="s2">&quot;std&quot;</span><span class="p">,</span> <span class="s2">&quot;min&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="n">formatted_percentiles</span> <span class="o">+</span> <span class="p">[</span><span class="s2">&quot;max&quot;</span><span class="p">]</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span><span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="n">s</span><span class="p">])</span> <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">product</span><span class="p">(</span><span class="n">agg_column_labels</span><span class="p">,</span> <span class="n">stats</span><span class="p">)]</span>
<span class="n">data_columns</span> <span class="o">=</span> <span class="nb">map</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">)</span>
<span class="c1"># Reindex the DataFrame to reflect initial grouping and agg columns.</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># Cast columns to ``&quot;float64&quot;`` to match `pandas.DataFrame.groupby`.</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s2">&quot;float64&quot;</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">corr</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">method</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;pearson&quot;</span><span class="p">,</span>
<span class="n">min_periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute pairwise correlation of columns, excluding NA/null values.</span>
<span class="sd"> .. versionadded:: 4.0.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> method : {&#39;pearson&#39;, &#39;spearman&#39;, &#39;kendall&#39;}</span>
<span class="sd"> * pearson : standard correlation coefficient</span>
<span class="sd"> * spearman : Spearman rank correlation</span>
<span class="sd"> * kendall : Kendall Tau correlation coefficient</span>
<span class="sd"> min_periods : int, default 1</span>
<span class="sd"> Minimum number of observations in window required to have a value</span>
<span class="sd"> (otherwise result is NA).</span>
<span class="sd"> numeric_only : bool, default False</span>
<span class="sd"> Include only `float`, `int` or `boolean` data.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.corrwith</span>
<span class="sd"> Series.corr</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> 1. Pearson, Kendall and Spearman correlation are currently computed using pairwise</span>
<span class="sd"> complete observations.</span>
<span class="sd"> 2. The complexity of Kendall correlation is O(#row * #row), if the dataset is too</span>
<span class="sd"> large, sampling ahead of correlation computation is recommended.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(</span>
<span class="sd"> ... {&quot;A&quot;: [0, 0, 0, 1, 1, 2], &quot;B&quot;: [-1, 2, 3, 5, 6, 0], &quot;C&quot;: [4, 6, 5, 1, 3, 0]},</span>
<span class="sd"> ... columns=[&quot;A&quot;, &quot;B&quot;, &quot;C&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).corr()</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 0 B 1.000000 0.720577</span>
<span class="sd"> C 0.720577 1.000000</span>
<span class="sd"> 1 B 1.000000 1.000000</span>
<span class="sd"> C 1.000000 1.000000</span>
<span class="sd"> 2 B NaN NaN</span>
<span class="sd"> C NaN NaN</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).corr(min_periods=2)</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 0 B 1.000000 0.720577</span>
<span class="sd"> C 0.720577 1.000000</span>
<span class="sd"> 1 B 1.000000 1.000000</span>
<span class="sd"> C 1.000000 1.000000</span>
<span class="sd"> 2 B NaN NaN</span>
<span class="sd"> C NaN NaN</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).corr(&quot;spearman&quot;)</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 0 B 1.0 0.5</span>
<span class="sd"> C 0.5 1.0</span>
<span class="sd"> 1 B 1.0 1.0</span>
<span class="sd"> C 1.0 1.0</span>
<span class="sd"> 2 B NaN NaN</span>
<span class="sd"> C NaN NaN</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).corr(&#39;kendall&#39;)</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 0 B 1.000000 0.333333</span>
<span class="sd"> C 0.333333 1.000000</span>
<span class="sd"> 1 B 1.000000 1.000000</span>
<span class="sd"> C 1.000000 1.000000</span>
<span class="sd"> 2 B 1.000000 NaN</span>
<span class="sd"> C NaN 1.000000</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">method</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">&quot;pearson&quot;</span><span class="p">,</span> <span class="s2">&quot;spearman&quot;</span><span class="p">,</span> <span class="s2">&quot;kendall&quot;</span><span class="p">]:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;Invalid method </span><span class="si">{</span><span class="n">method</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">)</span>
<span class="n">groupkey_names</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="nb">str</span><span class="p">(</span><span class="n">key</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span>
<span class="n">internal</span><span class="p">,</span> <span class="n">agg_columns</span><span class="p">,</span> <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_reduce</span><span class="p">(</span>
<span class="n">groupkey_names</span><span class="o">=</span><span class="n">groupkey_names</span><span class="p">,</span>
<span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)</span> <span class="k">if</span> <span class="n">numeric_only</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">numeric_labels</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">label</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> <span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">))</span>
<span class="p">]</span>
<span class="n">numeric_scols</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;double&quot;</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">numeric_labels</span>
<span class="p">]</span>
<span class="n">numeric_col_names</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">numeric_labels</span><span class="p">]</span>
<span class="n">num_scols</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">numeric_scols</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="n">index_1_col_name</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;__groupby_corr_index_1_temp_column__&quot;</span><span class="p">)</span>
<span class="n">index_2_col_name</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;__groupby_corr_index_2_temp_column__&quot;</span><span class="p">)</span>
<span class="n">pair_scols</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_scols</span><span class="p">):</span>
<span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">num_scols</span><span class="p">):</span>
<span class="n">pair_scols</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">i</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_1_col_name</span><span class="p">),</span>
<span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">j</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_2_col_name</span><span class="p">),</span>
<span class="n">numeric_scols</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">CORRELATION_VALUE_1_COLUMN</span><span class="p">),</span>
<span class="n">numeric_scols</span><span class="p">[</span><span class="n">j</span><span class="p">]</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">CORRELATION_VALUE_2_COLUMN</span><span class="p">),</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> <span class="o">*</span><span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">inline</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="o">*</span><span class="n">pair_scols</span><span class="p">))])</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">compute</span><span class="p">(</span>
<span class="n">sdf</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">groupKeys</span><span class="o">=</span><span class="n">groupkey_names</span> <span class="o">+</span> <span class="p">[</span><span class="n">index_1_col_name</span><span class="p">,</span> <span class="n">index_2_col_name</span><span class="p">],</span> <span class="n">method</span><span class="o">=</span><span class="n">method</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">method</span> <span class="o">==</span> <span class="s2">&quot;kendall&quot;</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span>
<span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="p">,</span>
<span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_1_col_name</span><span class="p">)</span> <span class="o">==</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_2_col_name</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mf">1.0</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="p">)</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span>
<span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="p">,</span>
<span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">CORRELATION_COUNT_OUTPUT_COLUMN</span><span class="p">)</span> <span class="o">&lt;</span> <span class="n">min_periods</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="p">)</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="n">auxiliary_col_name</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;__groupby_corr_auxiliary_temp_column__&quot;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span>
<span class="n">auxiliary_col_name</span><span class="p">,</span>
<span class="n">F</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_1_col_name</span><span class="p">)</span> <span class="o">==</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_2_col_name</span><span class="p">),</span>
<span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">([</span><span class="mi">0</span><span class="p">]),</span>
<span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">]))</span>
<span class="p">),</span>
<span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="o">*</span><span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">auxiliary_col_name</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_1_col_name</span><span class="p">))</span>
<span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_2_col_name</span><span class="p">))</span>
<span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_1_col_name</span><span class="p">),</span>
<span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">auxiliary_col_name</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_2_col_name</span><span class="p">))</span>
<span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_1_col_name</span><span class="p">))</span>
<span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_2_col_name</span><span class="p">),</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="p">),</span>
<span class="p">],</span>
<span class="p">)</span>
<span class="n">array_col_name</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;__groupby_corr_array_temp_column__&quot;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span> <span class="o">+</span> <span class="p">[</span><span class="n">index_1_col_name</span><span class="p">])</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">array_sort</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">collect_list</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_2_col_name</span><span class="p">),</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="p">),</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">array_col_name</span><span class="p">)</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_scols</span><span class="p">):</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">auxiliary_col_name</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">array_col_name</span><span class="p">),</span> <span class="n">i</span><span class="p">))</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span>
<span class="n">numeric_col_names</span><span class="p">[</span><span class="n">i</span><span class="p">],</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="sa">f</span><span class="s2">&quot;</span><span class="si">{</span><span class="n">auxiliary_col_name</span><span class="si">}</span><span class="s2">.</span><span class="si">{</span><span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="si">}</span><span class="s2">&quot;</span><span class="p">),</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">groupkey_names</span> <span class="o">+</span> <span class="p">[</span><span class="n">index_1_col_name</span><span class="p">])</span> <span class="c1"># type: ignore[arg-type]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="o">*</span><span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span> <span class="o">+</span> <span class="n">numeric_col_names</span><span class="p">],</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">F</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">numeric_col_names</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_1_col_name</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">auxiliary_col_name</span><span class="p">),</span>
<span class="n">F</span><span class="o">.</span><span class="n">monotonically_increasing_id</span><span class="p">()</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">),</span>
<span class="p">],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">key</span><span class="p">)</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">groupkey_names</span> <span class="o">+</span> <span class="p">[</span><span class="n">auxiliary_col_name</span><span class="p">]</span>
<span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">(</span>
<span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span>
<span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span>
<span class="p">),</span>
<span class="n">column_labels</span><span class="o">=</span><span class="n">numeric_labels</span><span class="p">,</span>
<span class="n">column_label_names</span><span class="o">=</span><span class="n">internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">class</span> <span class="nc">SeriesGroupBy</span><span class="p">(</span><span class="n">GroupBy</span><span class="p">[</span><span class="n">Series</span><span class="p">]):</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_build</span><span class="p">(</span>
<span class="n">psser</span><span class="p">:</span> <span class="n">Series</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SeriesGroupBy&quot;</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">(</span>
<span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">col_or_s</span><span class="p">)</span> <span class="k">for</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="n">by</span>
<span class="p">):</span>
<span class="n">psdf</span><span class="p">,</span> <span class="n">new_by_series</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping_from_diff_dataframes</span><span class="p">(</span>
<span class="n">psser</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(),</span> <span class="n">by</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">SeriesGroupBy</span><span class="p">(</span>
<span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">name</span><span class="p">),</span>
<span class="n">new_by_series</span><span class="p">,</span>
<span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span>
<span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">new_by_series</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> <span class="n">by</span><span class="p">)</span>
<span class="k">return</span> <span class="n">SeriesGroupBy</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">new_by_series</span><span class="p">,</span> <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">psser</span><span class="p">:</span> <span class="n">Series</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">):</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">as_index</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;as_index=False only valid with DataFrame&quot;</span><span class="p">)</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">=</span><span class="n">psser</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span>
<span class="n">groupkeys</span><span class="o">=</span><span class="n">by</span><span class="p">,</span>
<span class="n">as_index</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span>
<span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="nb">set</span><span class="p">(),</span>
<span class="n">agg_columns_selected</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">agg_columns</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="p">],</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_psser</span> <span class="o">=</span> <span class="n">psser</span>
<span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">MissingPandasLikeSeriesGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">):</span>
<span class="n">property_or_func</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">MissingPandasLikeSeriesGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="nb">property</span><span class="p">):</span>
<span class="k">return</span> <span class="n">property_or_func</span><span class="o">.</span><span class="n">fget</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">partial</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span>
<span class="k">raise</span> <span class="ne">AttributeError</span><span class="p">(</span><span class="n">item</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">&quot;SeriesGroupBy&quot;</span><span class="p">],</span> <span class="n">Series</span><span class="p">],</span>
<span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="n">DataError</span><span class="p">(</span><span class="s2">&quot;No numeric types to aggregate&quot;</span><span class="p">)</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">op</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="k">if</span> <span class="n">should_resolve</span><span class="p">:</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">psser</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">_handle_output</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">agg_column_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="k">if</span> <span class="n">agg_column_names</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">psdf</span><span class="p">[</span><span class="n">agg_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">agg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">MissingPandasLikeSeriesGroupBy</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">aggregate</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">MissingPandasLikeSeriesGroupBy</span><span class="o">.</span><span class="n">aggregate</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">size</span><span class="p">()</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="n">size</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">size</span><span class="o">.</span><span class="vm">__doc__</span>
<span class="c1"># TODO: add keep parameter</span>
<div class="viewcode-block" id="SeriesGroupBy.nsmallest"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.nsmallest.html#pyspark.pandas.groupby.SeriesGroupBy.nsmallest">[docs]</a> <span class="k">def</span> <span class="nf">nsmallest</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the smallest `n` elements.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> n : int</span>
<span class="sd"> Number of items to retrieve.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.nsmallest</span>
<span class="sd"> pyspark.pandas.DataFrame.nsmallest</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 1, 2, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;b&#39;: [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;a&#39;])[&#39;b&#39;].nsmallest(1).sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> a</span>
<span class="sd"> 1 0 1</span>
<span class="sd"> 2 3 2</span>
<span class="sd"> 3 6 3</span>
<span class="sd"> Name: b, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;nsmallest do not support multi-index now&quot;</span><span class="p">)</span>
<span class="n">groupkey_col_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="o">*</span><span class="p">[</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)],</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span>
<span class="p">],</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span>
<span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_col_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">.</span><span class="n">asc</span><span class="p">(),</span>
<span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">temp_rank_column</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;__rank__&quot;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span>
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">n</span><span class="p">)</span>
<span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span>
<span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">(</span>
<span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_col_names</span><span class="p">]</span>
<span class="o">+</span> <span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">)</span>
<span class="p">]</span>
<span class="p">),</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">(</span>
<span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span>
<span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span>
<span class="p">),</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">(</span>
<span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)</span>
<span class="p">]</span>
<span class="o">+</span> <span class="p">[</span>
<span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">field</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">)</span>
<span class="p">]</span>
<span class="p">),</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_column_label</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<span class="c1"># TODO: add keep parameter</span>
<div class="viewcode-block" id="SeriesGroupBy.nlargest"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.nlargest.html#pyspark.pandas.groupby.SeriesGroupBy.nlargest">[docs]</a> <span class="k">def</span> <span class="nf">nlargest</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the first n rows ordered by columns in descending order in group.</span>
<span class="sd"> Return the first n rows with the smallest values in columns, in descending order.</span>
<span class="sd"> The columns that are not specified are returned as well, but not used for ordering.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> n : int</span>
<span class="sd"> Number of items to retrieve.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.nlargest</span>
<span class="sd"> pyspark.pandas.DataFrame.nlargest</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 1, 2, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;b&#39;: [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;a&#39;])[&#39;b&#39;].nlargest(1).sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> a</span>
<span class="sd"> 1 1 2</span>
<span class="sd"> 2 4 3</span>
<span class="sd"> 3 7 4</span>
<span class="sd"> Name: b, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;nlargest do not support multi-index now&quot;</span><span class="p">)</span>
<span class="n">groupkey_col_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="o">*</span><span class="p">[</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)],</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span>
<span class="p">],</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span>
<span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_col_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">.</span><span class="n">desc</span><span class="p">(),</span>
<span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">temp_rank_column</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;__rank__&quot;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span>
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">n</span><span class="p">)</span>
<span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span>
<span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">(</span>
<span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_col_names</span><span class="p">]</span>
<span class="o">+</span> <span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">)</span>
<span class="p">]</span>
<span class="p">),</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">(</span>
<span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span>
<span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span>
<span class="p">),</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">(</span>
<span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)</span>
<span class="p">]</span>
<span class="o">+</span> <span class="p">[</span>
<span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">field</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">)</span>
<span class="p">]</span>
<span class="p">),</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_column_label</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<span class="c1"># TODO: add bins, normalize parameter</span>
<div class="viewcode-block" id="SeriesGroupBy.value_counts"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.value_counts.html#pyspark.pandas.groupby.SeriesGroupBy.value_counts">[docs]</a> <span class="k">def</span> <span class="nf">value_counts</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">sort</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute group sizes.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> sort : boolean, default None</span>
<span class="sd"> Sort by frequencies.</span>
<span class="sd"> ascending : boolean, default False</span>
<span class="sd"> Sort in ascending order.</span>
<span class="sd"> dropna : boolean, default True</span>
<span class="sd"> Don&#39;t include counts of NaN.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;B&#39;: [1, 1, 2, 3, 3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 1 1.0</span>
<span class="sd"> 1 2 1.0</span>
<span class="sd"> 2 2 2.0</span>
<span class="sd"> 3 3 3.0</span>
<span class="sd"> 4 3 3.0</span>
<span class="sd"> 5 3 NaN</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;)[&#39;B&#39;].value_counts().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> A B</span>
<span class="sd"> 1 1.0 1</span>
<span class="sd"> 2 1.0 1</span>
<span class="sd"> 2.0 1</span>
<span class="sd"> 3 3.0 2</span>
<span class="sd"> Name: count, dtype: int64</span>
<span class="sd"> Don&#39;t include counts of NaN when dropna is False.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;)[&#39;B&#39;].value_counts(</span>
<span class="sd"> ... dropna=False).sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> A B</span>
<span class="sd"> 1 1.0 1</span>
<span class="sd"> 2 1.0 1</span>
<span class="sd"> 2.0 1</span>
<span class="sd"> 3 3.0 2</span>
<span class="sd"> NaN 1</span>
<span class="sd"> Name: count, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;The resulting Series will have a fixed name of &#39;count&#39; from 4.0.0.&quot;</span><span class="p">,</span>
<span class="ne">FutureWarning</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">groupkeys</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))]</span>
<span class="n">groupkey_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="n">agg_column</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_cols</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s2">&quot;count&quot;</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">:</span>
<span class="n">_groupkey_column_names</span> <span class="o">=</span> <span class="n">groupkey_names</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span><span class="n">subset</span><span class="o">=</span><span class="n">_groupkey_column_names</span><span class="p">)</span>
<span class="k">if</span> <span class="n">dropna</span><span class="p">:</span>
<span class="n">_agg_columns_names</span> <span class="o">=</span> <span class="n">groupkey_names</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> <span class="p">:]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span><span class="n">subset</span><span class="o">=</span><span class="n">_agg_columns_names</span><span class="p">)</span>
<span class="k">if</span> <span class="n">sort</span><span class="p">:</span>
<span class="k">if</span> <span class="n">ascending</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)</span><span class="o">.</span><span class="n">asc</span><span class="p">())</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)</span><span class="o">.</span><span class="n">desc</span><span class="p">())</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">groupkeys</span><span class="p">],</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[(</span><span class="s2">&quot;count&quot;</span><span class="p">,)],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<div class="viewcode-block" id="SeriesGroupBy.unique"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.unique.html#pyspark.pandas.groupby.SeriesGroupBy.unique">[docs]</a> <span class="k">def</span> <span class="nf">unique</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return unique values in group.</span>
<span class="sd"> Unique is returned in order of unknown. It does NOT sort.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.unique</span>
<span class="sd"> pyspark.pandas.Index.unique</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 1, 2, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;b&#39;: [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;a&#39;])[&#39;b&#39;].unique().sort_index() # doctest: +SKIP</span>
<span class="sd"> a</span>
<span class="sd"> 1 [1, 2]</span>
<span class="sd"> 2 [2, 3]</span>
<span class="sd"> 3 [3, 4]</span>
<span class="sd"> Name: b, dtype: object</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">collect_set</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">is_multi_agg_with_relabel</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Check whether the kwargs pass to .agg look like multi-agg with relabling.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> **kwargs : dict</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> bool</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; is_multi_agg_with_relabel(a=&#39;max&#39;)</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; is_multi_agg_with_relabel(a_max=(&#39;a&#39;, &#39;max&#39;),</span>
<span class="sd"> ... a_min=(&#39;a&#39;, &#39;min&#39;))</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; is_multi_agg_with_relabel()</span>
<span class="sd"> False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">kwargs</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">False</span>
<span class="k">return</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">values</span><span class="p">())</span>
<span class="k">def</span> <span class="nf">normalize_keyword_aggregation</span><span class="p">(</span>
<span class="n">kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="nb">str</span><span class="p">]],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Normalize user-provided kwargs.</span>
<span class="sd"> Transforms from the new ``Dict[str, NamedAgg]`` style kwargs</span>
<span class="sd"> to the old defaultdict[str, List[scalar]].</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> kwargs : dict</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> aggspec : dict</span>
<span class="sd"> The transformed kwargs.</span>
<span class="sd"> columns : List[str]</span>
<span class="sd"> The user-provided keys.</span>
<span class="sd"> order : List[Tuple[str, str]]</span>
<span class="sd"> Pairs of the input and output column names.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; normalize_keyword_aggregation({&#39;output&#39;: (&#39;input&#39;, &#39;sum&#39;)})</span>
<span class="sd"> (defaultdict(&lt;class &#39;list&#39;&gt;, {&#39;input&#39;: [&#39;sum&#39;]}), [&#39;output&#39;], [(&#39;input&#39;, &#39;sum&#39;)])</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">aggspec</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">list</span><span class="p">)</span>
<span class="n">order</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">columns</span><span class="p">,</span> <span class="n">pairs</span> <span class="o">=</span> <span class="nb">zip</span><span class="p">(</span><span class="o">*</span><span class="n">kwargs</span><span class="o">.</span><span class="n">items</span><span class="p">())</span>
<span class="k">for</span> <span class="n">column</span><span class="p">,</span> <span class="n">aggfunc</span> <span class="ow">in</span> <span class="n">pairs</span><span class="p">:</span>
<span class="k">if</span> <span class="n">column</span> <span class="ow">in</span> <span class="n">aggspec</span><span class="p">:</span>
<span class="n">aggspec</span><span class="p">[</span><span class="n">column</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">aggfunc</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">aggspec</span><span class="p">[</span><span class="n">column</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="n">aggfunc</span><span class="p">]</span>
<span class="n">order</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">column</span><span class="p">,</span> <span class="n">aggfunc</span><span class="p">))</span>
<span class="c1"># For MultiIndex, we need to flatten the tuple, e.g. ((&#39;y&#39;, &#39;A&#39;), &#39;max&#39;) needs to be</span>
<span class="c1"># flattened to (&#39;y&#39;, &#39;A&#39;, &#39;max&#39;), it won&#39;t do anything on normal Index.</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">order</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">],</span> <span class="nb">tuple</span><span class="p">):</span>
<span class="n">order</span> <span class="o">=</span> <span class="p">[(</span><span class="o">*</span><span class="n">levs</span><span class="p">,</span> <span class="n">method</span><span class="p">)</span> <span class="k">for</span> <span class="n">levs</span><span class="p">,</span> <span class="n">method</span> <span class="ow">in</span> <span class="n">order</span><span class="p">]</span>
<span class="k">return</span> <span class="n">aggspec</span><span class="p">,</span> <span class="nb">list</span><span class="p">(</span><span class="n">columns</span><span class="p">),</span> <span class="n">order</span>
<span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">doctest</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">numpy</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span>
<span class="kn">import</span> <span class="nn">pyspark.pandas.groupby</span>
<span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_HOME&quot;</span><span class="p">])</span>
<span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">groupby</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;np&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">numpy</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;ps&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span>
<span class="n">spark</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[4]&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;pyspark.pandas.groupby tests&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">groupby</span><span class="p">,</span>
<span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span>
<span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="n">_test</span><span class="p">()</span>
</pre></div>
</article>
<footer class="bd-footer-article">
<div class="footer-article-items footer-article__inner">
<div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
</div></div>
</div>
</footer>
</div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script>
<script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item"><p class="copyright">
Copyright @ 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
</p></div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item"><p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3.
</p></div>
</div>
</div>
</footer>
</body>
</html>