Source code for pyspark.pandas.groupby

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
A wrapper for GroupedData to behave like pandas GroupBy.
"""
| <span class="kn">from</span> <span class="nn">abc</span> <span class="kn">import</span> <span class="n">ABCMeta</span><span class="p">,</span> <span class="n">abstractmethod</span> |
| <span class="kn">import</span> <span class="nn">inspect</span> |
| <span class="kn">from</span> <span class="nn">collections</span> <span class="kn">import</span> <span class="n">defaultdict</span><span class="p">,</span> <span class="n">namedtuple</span> |
| <span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">partial</span> |
| <span class="kn">from</span> <span class="nn">itertools</span> <span class="kn">import</span> <span class="n">product</span> |
| <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">Any</span><span class="p">,</span> |
| <span class="n">Callable</span><span class="p">,</span> |
| <span class="n">Dict</span><span class="p">,</span> |
| <span class="n">Generic</span><span class="p">,</span> |
| <span class="n">Iterator</span><span class="p">,</span> |
| <span class="n">Mapping</span><span class="p">,</span> |
| <span class="n">List</span><span class="p">,</span> |
| <span class="n">Optional</span><span class="p">,</span> |
| <span class="n">Sequence</span><span class="p">,</span> |
| <span class="n">Set</span><span class="p">,</span> |
| <span class="n">Tuple</span><span class="p">,</span> |
| <span class="n">Type</span><span class="p">,</span> |
| <span class="n">Union</span><span class="p">,</span> |
| <span class="n">cast</span><span class="p">,</span> |
| <span class="n">TYPE_CHECKING</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">import</span> <span class="nn">warnings</span> |
| |
| <span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span> |
| <span class="kn">from</span> <span class="nn">pandas.api.types</span> <span class="kn">import</span> <span class="n">is_number</span><span class="p">,</span> <span class="n">is_hashable</span><span class="p">,</span> <span class="n">is_list_like</span> <span class="c1"># type: ignore[attr-defined]</span> |
| <span class="kn">from</span> <span class="nn">pandas.core.common</span> <span class="kn">import</span> <span class="n">_builtin_table</span> <span class="c1"># type: ignore[attr-defined]</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Column</span><span class="p">,</span> <span class="n">DataFrame</span> <span class="k">as</span> <span class="n">SparkDataFrame</span><span class="p">,</span> <span class="n">Window</span><span class="p">,</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">BooleanType</span><span class="p">,</span> |
| <span class="n">DataType</span><span class="p">,</span> |
| <span class="n">DoubleType</span><span class="p">,</span> |
| <span class="n">NumericType</span><span class="p">,</span> |
| <span class="n">StructField</span><span class="p">,</span> |
| <span class="n">StructType</span><span class="p">,</span> |
| <span class="n">StringType</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">pandas</span> <span class="k">as</span> <span class="n">ps</span> <span class="c1"># For running doctests and reference resolution in PyCharm.</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas._typing</span> <span class="kn">import</span> <span class="n">Axis</span><span class="p">,</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">Label</span><span class="p">,</span> <span class="n">Name</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.typedef</span> <span class="kn">import</span> <span class="n">infer_return_type</span><span class="p">,</span> <span class="n">DataFrameType</span><span class="p">,</span> <span class="n">ScalarType</span><span class="p">,</span> <span class="n">SeriesType</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.frame</span> <span class="kn">import</span> <span class="n">DataFrame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.internal</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">InternalField</span><span class="p">,</span> |
| <span class="n">InternalFrame</span><span class="p">,</span> |
| <span class="n">HIDDEN_COLUMNS</span><span class="p">,</span> |
| <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> |
| <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">,</span> |
| <span class="n">SPARK_DEFAULT_SERIES_NAME</span><span class="p">,</span> |
| <span class="n">SPARK_INDEX_NAME_PATTERN</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.missing.groupby</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">MissingPandasLikeDataFrameGroupBy</span><span class="p">,</span> |
| <span class="n">MissingPandasLikeSeriesGroupBy</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span><span class="p">,</span> <span class="n">first_series</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.spark</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">SF</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.config</span> <span class="kn">import</span> <span class="n">get_option</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.correlation</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">compute</span><span class="p">,</span> |
| <span class="n">CORRELATION_VALUE_1_COLUMN</span><span class="p">,</span> |
| <span class="n">CORRELATION_VALUE_2_COLUMN</span><span class="p">,</span> |
| <span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="p">,</span> |
| <span class="n">CORRELATION_COUNT_OUTPUT_COLUMN</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.utils</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">align_diff_frames</span><span class="p">,</span> |
| <span class="n">is_name_like_tuple</span><span class="p">,</span> |
| <span class="n">is_name_like_value</span><span class="p">,</span> |
| <span class="n">name_like_string</span><span class="p">,</span> |
| <span class="n">same_anchor</span><span class="p">,</span> |
| <span class="n">scol_for</span><span class="p">,</span> |
| <span class="n">verify_temp_column_name</span><span class="p">,</span> |
| <span class="n">log_advice</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.spark.utils</span> <span class="kn">import</span> <span class="n">as_nullable_spark_type</span><span class="p">,</span> <span class="n">force_decimal_precision_scale</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.exceptions</span> <span class="kn">import</span> <span class="n">DataError</span> |
| |
| <span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">RollingGroupby</span><span class="p">,</span> <span class="n">ExpandingGroupby</span><span class="p">,</span> <span class="n">ExponentialMovingGroupby</span> |
| |
| |
| <span class="c1"># to keep it the same as pandas</span> |
| <span class="n">NamedAgg</span> <span class="o">=</span> <span class="n">namedtuple</span><span class="p">(</span><span class="s2">"NamedAgg"</span><span class="p">,</span> <span class="p">[</span><span class="s2">"column"</span><span class="p">,</span> <span class="s2">"aggfunc"</span><span class="p">])</span> |


class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
    """
    :ivar _psdf: The parent dataframe that is used to perform the groupby
    :type _psdf: DataFrame
    :ivar _groupkeys: The list of keys that will be used to perform the grouping
    :type _groupkeys: List[Series]
    """

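    # The remaining constructor arguments broadly mirror ``DataFrame.groupby``
    # options (pandas semantics): ``as_index`` controls whether the group keys
    # become the result index, ``dropna`` whether rows with a missing group key
    # are excluded, ``column_labels_to_exclude`` which labels are left out of
    # the aggregation, and ``agg_columns``/``agg_columns_selected`` which Series
    # are aggregated and whether the caller pre-selected them.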
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">groupkeys</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> |
| <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="n">column_labels_to_exclude</span><span class="p">:</span> <span class="n">Set</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> |
| <span class="n">agg_columns_selected</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="n">agg_columns</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> |
| <span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> <span class="o">=</span> <span class="n">psdf</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span> <span class="o">=</span> <span class="n">groupkeys</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span> <span class="o">=</span> <span class="n">as_index</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span> <span class="o">=</span> <span class="n">dropna</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span> <span class="o">=</span> <span class="n">column_labels_to_exclude</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span> <span class="o">=</span> <span class="n">agg_columns_selected</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> <span class="o">=</span> <span class="n">agg_columns</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">_groupkeys_scols</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]:</span> |
| <span class="k">return</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">_agg_columns_scols</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]:</span> |
| <span class="k">return</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">]</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">"SeriesGroupBy"</span><span class="p">],</span> <span class="n">Series</span><span class="p">],</span> |
| <span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">_handle_output</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">agg_column_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="c1"># TODO: Series support is not implemented yet.</span> |
| <span class="c1"># TODO: not all arguments are implemented comparing to pandas' for now.</span> |
| <span class="k">def</span> <span class="nf">aggregate</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">func_or_funcs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Aggregate using one or more operations over the specified axis.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func_or_funcs : dict, str or list</span> |
| <span class="sd"> a dict mapping from column name (string) to</span> |
| <span class="sd"> aggregate functions (string or list of strings).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| |
| <span class="sd"> The return can be:</span> |
| |
| <span class="sd"> * Series : when DataFrame.agg is called with a single function</span> |
| <span class="sd"> * DataFrame : when DataFrame.agg is called with several functions</span> |
| |
| <span class="sd"> Return Series or DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> `agg` is an alias for `aggregate`. Use the alias.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 1, 2, 2],</span> |
| <span class="sd"> ... 'B': [1, 2, 3, 4],</span> |
| <span class="sd"> ... 'C': [0.362, 0.227, 1.267, -0.562]},</span> |
| <span class="sd"> ... columns=['A', 'B', 'C'])</span> |
| |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 1 1 0.362</span> |
| <span class="sd"> 1 1 2 0.227</span> |
| <span class="sd"> 2 2 3 1.267</span> |
| <span class="sd"> 3 2 4 -0.562</span> |
| |
| <span class="sd"> Different aggregations per column</span> |
| |
| <span class="sd"> >>> aggregated = df.groupby('A').agg({'B': 'min', 'C': 'sum'})</span> |
| <span class="sd"> >>> aggregated[['B', 'C']].sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1 0.589</span> |
| <span class="sd"> 2 3 0.705</span> |
| |
| <span class="sd"> >>> aggregated = df.groupby('A').agg({'B': ['min', 'max']})</span> |
| <span class="sd"> >>> aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B</span> |
| <span class="sd"> min max</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1 2</span> |
| <span class="sd"> 2 3 4</span> |
| |
| <span class="sd"> >>> aggregated = df.groupby('A').agg('min')</span> |
| <span class="sd"> >>> aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1 0.227</span> |
| <span class="sd"> 2 3 -0.562</span> |
| |
| <span class="sd"> >>> aggregated = df.groupby('A').agg(['min', 'max'])</span> |
| <span class="sd"> >>> aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> min max min max</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1 2 0.227 0.362</span> |
| <span class="sd"> 2 3 4 -0.562 1.267</span> |
| |
| <span class="sd"> To control the output names with different aggregations per column, pandas-on-Spark</span> |
| <span class="sd"> also supports 'named aggregation' or nested renaming in .agg. It can also be</span> |
| <span class="sd"> used when applying multiple aggregation functions to specific columns.</span> |
| |
| <span class="sd"> >>> aggregated = df.groupby('A').agg(b_max=ps.NamedAgg(column='B', aggfunc='max'))</span> |
| <span class="sd"> >>> aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b_max</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 4</span> |
| |
| <span class="sd"> >>> aggregated = df.groupby('A').agg(b_max=('B', 'max'), b_min=('B', 'min'))</span> |
| <span class="sd"> >>> aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b_max b_min</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 2 1</span> |
| <span class="sd"> 2 4 3</span> |
| |
| <span class="sd"> >>> aggregated = df.groupby('A').agg(b_max=('B', 'max'), c_min=('C', 'min'))</span> |
| <span class="sd"> >>> aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b_max c_min</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 2 0.227</span> |
| <span class="sd"> 2 4 -0.562</span> |
| <span class="sd"> """</span> |
| <span class="c1"># I think current implementation of func and arguments in pandas-on-Spark for aggregate</span> |
| <span class="c1"># is different than pandas, later once arguments are added, this could be removed.</span> |
| <span class="k">if</span> <span class="n">func_or_funcs</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">kwargs</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"No aggregation argument or function specified."</span><span class="p">)</span> |
| |
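        # Keyword-only calls such as ``agg(b_max=("B", "max"))`` are pandas-style
        # named aggregation: normalize them into a column -> functions mapping
        # plus the output column names and their order.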
| <span class="n">relabeling</span> <span class="o">=</span> <span class="n">func_or_funcs</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">is_multi_agg_with_relabel</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">relabeling</span><span class="p">:</span> |
| <span class="p">(</span> |
| <span class="n">func_or_funcs</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">,</span> |
| <span class="n">order</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">=</span> <span class="n">normalize_keyword_aggregation</span><span class="p">(</span> <span class="c1"># type: ignore[assignment]</span> |
| <span class="n">kwargs</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func_or_funcs</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="nb">list</span><span class="p">)):</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func_or_funcs</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span> |
| <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> |
| <span class="ow">and</span> <span class="p">(</span> |
| <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> |
| <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> |
| <span class="ow">and</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">value</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">func_or_funcs</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> |
| <span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"aggs must be a dict mapping from column name "</span> |
| <span class="s2">"to aggregate functions (string or list of strings)."</span> |
| <span class="p">)</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">agg_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span><span class="o">.</span><span class="n">name</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">]</span> |
| <span class="n">func_or_funcs</span> <span class="o">=</span> <span class="p">{</span><span class="n">col</span><span class="p">:</span> <span class="n">func_or_funcs</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">agg_cols</span><span class="p">}</span> |
| |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_groupby</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> <span class="n">func_or_funcs</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
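        # ``dropna=True`` (the default) excludes groups whose key is missing:
        # drop rows whose index (group key) columns contain nulls.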
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span> |
| <span class="n">subset</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
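        # ``as_index=False`` returns the group keys as regular columns instead of
        # as the index: key levels that come from a different frame are reset
        # separately (and usually dropped), then any remaining keys are restored
        # as columns with ``reset_index()``.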
| <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">:</span> |
| <span class="n">index_cols</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="n">should_drop_index</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span> |
| <span class="n">i</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">gkey</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> <span class="k">if</span> <span class="n">gkey</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">should_drop_index</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">drop</span> <span class="o">=</span> <span class="ow">not</span> <span class="nb">any</span><span class="p">(</span> |
| <span class="p">[</span> |
| <span class="nb">isinstance</span><span class="p">(</span><span class="n">func_or_funcs</span><span class="p">[</span><span class="n">gkey</span><span class="o">.</span><span class="n">name</span><span class="p">],</span> <span class="nb">list</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">gkey</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span> |
| <span class="k">if</span> <span class="n">gkey</span><span class="o">.</span><span class="n">name</span> <span class="ow">in</span> <span class="n">func_or_funcs</span> |
| <span class="p">]</span> |
| <span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">level</span><span class="o">=</span><span class="n">should_drop_index</span><span class="p">,</span> <span class="n">drop</span><span class="o">=</span><span class="n">drop</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">should_drop_index</span><span class="p">)</span> <span class="o"><</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">):</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span> |
| <span class="n">index_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">c</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> <span class="k">if</span> <span class="n">c</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">index_cols</span><span class="p">]</span> |
| <span class="k">if</span> <span class="n">relabeling</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">(</span><span class="n">index_cols</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">order</span><span class="p">))]</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">([</span><span class="n">c</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">index_cols</span><span class="p">]</span> <span class="o">+</span> <span class="nb">list</span><span class="p">(</span><span class="n">columns</span><span class="p">))</span> |
| |
| <span class="k">if</span> <span class="n">relabeling</span> <span class="ow">and</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">order</span><span class="p">]</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">columns</span> <span class="c1"># type: ignore[assignment]</span> |
| <span class="k">return</span> <span class="n">psdf</span> |
| |
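    # Keep ``agg`` as an alias so both spellings work, as in pandas.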
| <span class="n">agg</span> <span class="o">=</span> <span class="n">aggregate</span> |

    @staticmethod
    def _spark_groupby(
        psdf: DataFrame,
        func: Mapping[Name, Union[str, List[str]]],
        groupkeys: Sequence[Series] = (),
    ) -> InternalFrame:
        groupkey_names = [SPARK_INDEX_NAME_FORMAT(i) for i in range(len(groupkeys))]
        groupkey_scols = [s.spark.column.alias(name) for s, name in zip(groupkeys, groupkey_names)]

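        # When any column maps to a list of functions, the function name is added as
        # an extra column-label level so the result matches pandas' MultiIndex columns.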
| <span class="n">multi_aggs</span> <span class="o">=</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">func</span><span class="o">.</span><span class="n">values</span><span class="p">())</span> |
| <span class="n">reordered</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">data_columns</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">func</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="n">label</span> <span class="o">=</span> <span class="n">key</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">key</span><span class="p">,)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">!=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"The length of the key must be the same as the column label level."</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">aggfunc</span> <span class="ow">in</span> <span class="p">[</span><span class="n">value</span><span class="p">]</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="n">value</span><span class="p">:</span> |
| <span class="n">column_label</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="n">aggfunc</span><span class="p">])</span> <span class="k">if</span> <span class="n">multi_aggs</span> <span class="k">else</span> <span class="n">label</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">column_label</span><span class="p">)</span> |
| |
| <span class="n">data_col</span> <span class="o">=</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">column_label</span><span class="p">)</span> |
| <span class="n">data_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">data_col</span><span class="p">)</span> |
| |
| <span class="n">col_name</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">aggfunc</span> <span class="o">==</span> <span class="s2">"nunique"</span><span class="p">:</span> |
| <span class="n">reordered</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span><span class="s2">"count(DISTINCT `</span><span class="si">{0}</span><span class="s2">`) as `</span><span class="si">{1}</span><span class="s2">`"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">col_name</span><span class="p">,</span> <span class="n">data_col</span><span class="p">))</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># Implement "quartiles" aggregate function for ``describe``.</span> |
| <span class="k">elif</span> <span class="n">aggfunc</span> <span class="o">==</span> <span class="s2">"quartiles"</span><span class="p">:</span> |
| <span class="n">reordered</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span> |
| <span class="s2">"percentile_approx(`</span><span class="si">{0}</span><span class="s2">`, array(0.25, 0.5, 0.75)) as `</span><span class="si">{1}</span><span class="s2">`"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">col_name</span><span class="p">,</span> <span class="n">data_col</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">reordered</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span><span class="s2">"</span><span class="si">{1}</span><span class="s2">(`</span><span class="si">{0}</span><span class="s2">`) as `</span><span class="si">{2}</span><span class="s2">`"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">col_name</span><span class="p">,</span> <span class="n">aggfunc</span><span class="p">,</span> <span class="n">data_col</span><span class="p">))</span> |
| <span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">groupkey_scols</span> <span class="o">+</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">reordered</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">groupkeys</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span> |
| <span class="p">)</span> |
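| |
| <span class="c1"># NOTE (editorial sketch, not part of the original source): the helper above rewrites</span> |
| <span class="c1"># each (column, aggfunc) pair into a plain Spark SQL expression and then runs a single</span> |
| <span class="c1"># groupBy().agg(). Roughly the same mapping can be reproduced directly on a Spark</span> |
| <span class="c1"># DataFrame; the column and alias names below are made up for illustration:</span> |
| <span class="c1">#</span> |
| <span class="c1">#   from pyspark.sql import SparkSession, functions as F</span> |
| <span class="c1">#</span> |
| <span class="c1">#   spark = SparkSession.builder.getOrCreate()</span> |
| <span class="c1">#   sdf = spark.createDataFrame([("a", 1), ("a", 2), ("b", 3)], ["key", "val"])</span> |
| <span class="c1">#   exprs = [</span> |
| <span class="c1">#       F.expr("count(DISTINCT `val`) as `val_nunique`"),  # aggfunc == "nunique"</span> |
| <span class="c1">#       F.expr("percentile_approx(`val`, array(0.25, 0.5, 0.75)) as `val_quartiles`"),</span> |
| <span class="c1">#       F.expr("max(`val`) as `val_max`"),  # any other aggfunc name</span> |
| <span class="c1">#   ]</span> |
| <span class="c1">#   sdf.groupBy("key").agg(*exprs).show()</span> |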
| |
| <div class="viewcode-block" id="GroupBy.count"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.count.html#pyspark.pandas.groupby.GroupBy.count">[docs]</a> <span class="k">def</span> <span class="nf">count</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute count of group, excluding missing values.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 1, 2, 1, 2],</span> |
| <span class="sd"> ... 'B': [np.nan, 2, 3, 4, 5],</span> |
| <span class="sd"> ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])</span> |
| <span class="sd"> >>> df.groupby('A').count().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 2 3</span> |
| <span class="sd"> 2 2 2</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.first"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.first.html#pyspark.pandas.groupby.GroupBy.first">[docs]</a> <span class="k">def</span> <span class="nf">first</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute first of group values.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numeric_only : bool, default False</span> |
| <span class="sd"> Include only float, int, boolean columns. If None, will attempt to use</span> |
| <span class="sd"> everything, then use only numeric data.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| <span class="sd"> min_count : int, default -1</span> |
| <span class="sd"> The required number of valid values to perform the operation. If fewer</span> |
| <span class="sd"> than ``min_count`` non-NA values are present the result will be NA.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],</span> |
| <span class="sd"> ... "C": [3, 3, 4, 4], "D": ["a", "b", "a", "a"]})</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 1 True 3 a</span> |
| <span class="sd"> 1 2 False 3 b</span> |
| <span class="sd"> 2 1 False 4 a</span> |
| <span class="sd"> 3 2 True 4 a</span> |
| |
| <span class="sd"> >>> df.groupby("A").first().sort_index()</span> |
| <span class="sd"> B C D</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 True 3 a</span> |
| <span class="sd"> 2 False 3 b</span> |
| |
| <span class="sd"> Include only float, int, and boolean columns when ``numeric_only`` is set to True.</span> |
| |
| <span class="sd"> >>> df.groupby("A").first(numeric_only=True).sort_index()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 True 3</span> |
| <span class="sd"> 2 False 3</span> |
| |
| <span class="sd"> >>> df.groupby("D").first().sort_index()</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> D</span> |
| <span class="sd"> a 1 True 3</span> |
| <span class="sd"> b 2 False 3</span> |
| |
| <span class="sd"> >>> df.groupby("D").first(min_count=3).sort_index()</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> D</span> |
| <span class="sd"> a 1.0 True 3.0</span> |
| <span class="sd"> b NaN None NaN</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">min_count</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"min_count must be integer"</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">first</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">ignorenulls</span><span class="o">=</span><span class="kc">True</span><span class="p">),</span> |
| <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)</span> <span class="k">if</span> <span class="n">numeric_only</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span><span class="p">,</span> |
| <span class="p">)</span></div> |
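| |
| <span class="c1"># NOTE (editorial sketch, not part of the original source): the reduction above uses</span> |
| <span class="c1"># F.first(col, ignorenulls=True), so a missing value at the start of a group is skipped</span> |
| <span class="c1"># rather than returned. A small hypothetical illustration:</span> |
| <span class="c1">#</span> |
| <span class="c1">#   >>> df = ps.DataFrame({"A": [1, 1, 2], "B": [None, 2.0, 3.0]})</span> |
| <span class="c1">#   >>> df.groupby("A").first().sort_index()</span> |
| <span class="c1">#        B</span> |
| <span class="c1">#   A</span> |
| <span class="c1">#   1  2.0</span> |
| <span class="c1">#   2  3.0</span> |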
| |
| <div class="viewcode-block" id="GroupBy.last"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.last.html#pyspark.pandas.groupby.GroupBy.last">[docs]</a> <span class="k">def</span> <span class="nf">last</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute last of group values.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numeric_only : bool, default False</span> |
| <span class="sd"> Include only float, int, boolean columns. If None, will attempt to use</span> |
| <span class="sd"> everything, then use only numeric data.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| <span class="sd"> min_count : int, default -1</span> |
| <span class="sd"> The required number of valid values to perform the operation. If fewer</span> |
| <span class="sd"> than ``min_count`` non-NA values are present the result will be NA.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],</span> |
| <span class="sd"> ... "C": [3, 3, 4, 4], "D": ["a", "a", "b", "a"]})</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 1 True 3 a</span> |
| <span class="sd"> 1 2 False 3 a</span> |
| <span class="sd"> 2 1 False 4 b</span> |
| <span class="sd"> 3 2 True 4 a</span> |
| |
| <span class="sd"> >>> df.groupby("A").last().sort_index()</span> |
| <span class="sd"> B C D</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 False 4 b</span> |
| <span class="sd"> 2 True 4 a</span> |
| |
| <span class="sd"> Include only float, int, and boolean columns when ``numeric_only`` is set to True.</span> |
| |
| <span class="sd"> >>> df.groupby("A").last(numeric_only=True).sort_index()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 False 4</span> |
| <span class="sd"> 2 True 4</span> |
| |
| <span class="sd"> >>> df.groupby("D").last().sort_index()</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> D</span> |
| <span class="sd"> a 2 True 4</span> |
| <span class="sd"> b 1 False 4</span> |
| |
| <span class="sd"> >>> df.groupby("D").last(min_count=3).sort_index()</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> D</span> |
| <span class="sd"> a 2.0 True 4.0</span> |
| <span class="sd"> b NaN None NaN</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">min_count</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"min_count must be integer"</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">last</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">ignorenulls</span><span class="o">=</span><span class="kc">True</span><span class="p">),</span> |
| <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)</span> <span class="k">if</span> <span class="n">numeric_only</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.max"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.max.html#pyspark.pandas.groupby.GroupBy.max">[docs]</a> <span class="k">def</span> <span class="nf">max</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute max of group values.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numeric_only : bool, default False</span> |
| <span class="sd"> Include only float, int, boolean columns. If None, will attempt to use</span> |
| <span class="sd"> everything, then use only numeric data.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| <span class="sd"> min_count : int, default -1</span> |
| <span class="sd"> The required number of valid values to perform the operation. If fewer</span> |
| <span class="sd"> than min_count non-NA values are present the result will be NA.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],</span> |
| <span class="sd"> ... "C": [3, 4, 3, 4], "D": ["a", "a", "b", "a"]})</span> |
| |
| <span class="sd"> >>> df.groupby("A").max().sort_index()</span> |
| <span class="sd"> B C D</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 True 3 b</span> |
| <span class="sd"> 2 True 4 a</span> |
| |
| <span class="sd"> Include only float, int, and boolean columns when ``numeric_only`` is set to True.</span> |
| |
| <span class="sd"> >>> df.groupby("A").max(numeric_only=True).sort_index()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 True 3</span> |
| <span class="sd"> 2 True 4</span> |
| |
| <span class="sd"> >>> df.groupby("D").max().sort_index()</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> D</span> |
| <span class="sd"> a 2 True 4</span> |
| <span class="sd"> b 1 False 3</span> |
| |
| <span class="sd"> >>> df.groupby("D").max(min_count=3).sort_index()</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> D</span> |
| <span class="sd"> a 2.0 True 4.0</span> |
| <span class="sd"> b NaN None NaN</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">min_count</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"min_count must be integer"</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">,</span> |
| <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)</span> <span class="k">if</span> <span class="n">numeric_only</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span><span class="p">,</span> |
| <span class="p">)</span></div> |
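| |
| <span class="c1"># NOTE (editorial sketch, not part of the original source): for first(), last(), max()</span> |
| <span class="c1"># and min() the `min_count` argument is honored per column: any group with fewer than</span> |
| <span class="c1"># `min_count` non-NA values in a column gets NA for that column. With the docstring's</span> |
| <span class="c1"># `df` above, group "b" has a single row, hence:</span> |
| <span class="c1">#</span> |
| <span class="c1">#   >>> df.groupby("D").max(min_count=2).sort_index()</span> |
| <span class="c1">#        A     B    C</span> |
| <span class="c1">#   D</span> |
| <span class="c1">#   a  2.0  True  4.0</span> |
| <span class="c1">#   b  NaN  None  NaN</span> |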
| |
| <div class="viewcode-block" id="GroupBy.mean"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.mean.html#pyspark.pandas.groupby.GroupBy.mean">[docs]</a> <span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute mean of groups, excluding missing values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numeric_only : bool, default False</span> |
| <span class="sd"> Include only float, int, boolean columns.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> pyspark.pandas.Series or pyspark.pandas.DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 1, 2, 1, 2],</span> |
| <span class="sd"> ... 'B': [np.nan, 2, 3, 4, 5],</span> |
| <span class="sd"> ... 'C': [1, 2, 1, 1, 2],</span> |
| <span class="sd"> ... 'D': [True, False, True, False, True]})</span> |
| |
| <span class="sd"> Groupby one column and return the mean of the remaining columns in</span> |
| <span class="sd"> each group.</span> |
| |
| <span class="sd"> >>> df.groupby('A').mean().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C D</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 3.0 1.333333 0.333333</span> |
| <span class="sd"> 2 4.0 1.500000 1.000000</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_validate_agg_columns</span><span class="p">(</span><span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">function_name</span><span class="o">=</span><span class="s2">"mean"</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">mean</span><span class="p">,</span> <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,),</span> <span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span> |
| <span class="p">)</span></div> |
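| |
| <span class="c1"># NOTE (editorial sketch, not part of the original source): mean() passes</span> |
| <span class="c1"># bool_to_numeric=True, so boolean columns are averaged as 0/1 instead of being</span> |
| <span class="c1"># dropped, which is why column D in the docstring example yields 0.333333 for</span> |
| <span class="c1"># group 1. Roughly the same result can be obtained by casting up front:</span> |
| <span class="c1">#</span> |
| <span class="c1">#   >>> df.assign(D=df["D"].astype(int)).groupby("A").mean().sort_index()</span> |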
| |
| <span class="c1"># TODO: 'q' accepts list like type</span> |
| <div class="viewcode-block" id="GroupBy.quantile"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.quantile.html#pyspark.pandas.groupby.GroupBy.quantile">[docs]</a> <span class="k">def</span> <span class="nf">quantile</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">q</span><span class="p">:</span> <span class="nb">float</span> <span class="o">=</span> <span class="mf">0.5</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10000</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return group values at the given quantile.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> q : float, default 0.5 (50% quantile)</span> |
| <span class="sd"> Value between 0 and 1 providing the quantile to compute.</span> |
| <span class="sd"> accuracy : int, optional</span> |
| <span class="sd"> Default accuracy of approximation. Larger value means better accuracy.</span> |
| <span class="sd"> The relative error can be deduced by 1.0 / accuracy.</span> |
| <span class="sd"> This is a pandas-on-Spark specific parameter.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> pyspark.pandas.Series or pyspark.pandas.DataFrame</span> |
| <span class="sd"> Return type determined by caller of GroupBy object.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> `quantile` in pandas-on-Spark uses a distributed percentile approximation</span> |
| <span class="sd"> algorithm, unlike pandas, so the result might differ from pandas. The</span> |
| <span class="sd"> `interpolation` parameter is not supported yet.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.quantile</span> |
| <span class="sd"> pyspark.pandas.DataFrame.quantile</span> |
| <span class="sd"> pyspark.sql.functions.percentile_approx</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame([</span> |
| <span class="sd"> ... ['a', 1], ['a', 2], ['a', 3],</span> |
| <span class="sd"> ... ['b', 1], ['b', 3], ['b', 5]</span> |
| <span class="sd"> ... ], columns=['key', 'val'])</span> |
| |
| <span class="sd"> Groupby one column and return the quantile of the remaining columns in</span> |
| <span class="sd"> each group.</span> |
| |
| <span class="sd"> >>> df.groupby('key').quantile()</span> |
| <span class="sd"> val</span> |
| <span class="sd"> key</span> |
| <span class="sd"> a 2.0</span> |
| <span class="sd"> b 3.0</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">q</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"q doesn't support list-like types for now"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_number</span><span class="p">(</span><span class="n">q</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"must be real number, not </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">q</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="mi">0</span> <span class="o"><=</span> <span class="n">q</span> <span class="o"><=</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"'q' must be between 0 and 1. Got '</span><span class="si">%s</span><span class="s2">' instead"</span> <span class="o">%</span> <span class="n">q</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">_agg_col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)</span> <span class="k">for</span> <span class="n">_agg_col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">):</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="sa">f</span><span class="s2">"Allowing bool dtype in </span><span class="si">{</span><span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="o">.</span><span class="vm">__name__</span><span class="si">}</span><span class="s2">.quantile is deprecated "</span> |
| <span class="s2">"and will raise in a future version, matching the Series/DataFrame behavior. "</span> |
| <span class="s2">"Cast to uint8 dtype before calling quantile instead."</span><span class="p">,</span> |
| <span class="ne">FutureWarning</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">percentile_approx</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="n">DoubleType</span><span class="p">()),</span> <span class="n">q</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">),</span> |
| <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">),</span> |
| <span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
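| |
| <span class="c1"># NOTE (editorial sketch, not part of the original source): the group quantile above is</span> |
| <span class="c1"># computed with pyspark.sql.functions.percentile_approx, so a comparable number can be</span> |
| <span class="c1"># obtained directly on the underlying Spark DataFrame (df from the docstring example):</span> |
| <span class="c1">#</span> |
| <span class="c1">#   from pyspark.sql import functions as F</span> |
| <span class="c1">#</span> |
| <span class="c1">#   sdf = df.to_spark()</span> |
| <span class="c1">#   sdf.groupBy("key").agg(</span> |
| <span class="c1">#       F.percentile_approx(F.col("val").cast("double"), 0.5, 10000).alias("val")</span> |
| <span class="c1">#   ).show()</span> |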
| |
| <div class="viewcode-block" id="GroupBy.min"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.min.html#pyspark.pandas.groupby.GroupBy.min">[docs]</a> <span class="k">def</span> <span class="nf">min</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="o">-</span><span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute min of group values.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numeric_only : bool, default False</span> |
| <span class="sd"> Include only float, int, boolean columns. If None, will attempt to use</span> |
| <span class="sd"> everything, then use only numeric data.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| <span class="sd"> min_count : int, default -1</span> |
| <span class="sd"> The required number of valid values to perform the operation. If fewer</span> |
| <span class="sd"> than min_count non-NA values are present the result will be NA.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],</span> |
| <span class="sd"> ... "C": [3, 4, 3, 4], "D": ["a", "a", "b", "a"]})</span> |
| <span class="sd"> >>> df.groupby("A").min().sort_index()</span> |
| <span class="sd"> B C D</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 False 3 a</span> |
| <span class="sd"> 2 False 4 a</span> |
| |
| <span class="sd"> Include only float, int, and boolean columns when ``numeric_only`` is set to True.</span> |
| |
| <span class="sd"> >>> df.groupby("A").min(numeric_only=True).sort_index()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 False 3</span> |
| <span class="sd"> 2 False 4</span> |
| |
| <span class="sd"> >>> df.groupby("D").min().sort_index()</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> D</span> |
| <span class="sd"> a 1 False 3</span> |
| <span class="sd"> b 1 False 3</span> |
| |
| <span class="sd"> >>> df.groupby("D").min(min_count=3).sort_index()</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> D</span> |
| <span class="sd"> a 1.0 False 3.0</span> |
| <span class="sd"> b NaN None NaN</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">min_count</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"min_count must be integer"</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">,</span> |
| <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)</span> <span class="k">if</span> <span class="n">numeric_only</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <span class="c1"># TODO: sync the doc.</span> |
| <div class="viewcode-block" id="GroupBy.std"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.std.html#pyspark.pandas.groupby.GroupBy.std">[docs]</a> <span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute standard deviation of groups, excluding missing values.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> ddof : int, default 1</span> |
| <span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span> |
| <span class="sd"> where N represents the number of elements.</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports arbitrary integers.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],</span> |
| <span class="sd"> ... "C": [3, 4, 3, 4], "D": ["a", "b", "b", "a"]})</span> |
| |
| <span class="sd"> >>> df.groupby("A").std()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 0.707107 0.0</span> |
| <span class="sd"> 2 0.707107 0.0</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ddof</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"ddof must be integer"</span><span class="p">)</span> |
| |
| <span class="c1"># Raise the TypeError when all aggregation columns are of unaccepted data types</span> |
| <span class="n">any_accepted</span> <span class="o">=</span> <span class="nb">any</span><span class="p">(</span> |
| <span class="nb">isinstance</span><span class="p">(</span><span class="n">_agg_col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">_agg_col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">any_accepted</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Unaccepted data types of aggregation columns; numeric or bool expected."</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">SF</span><span class="o">.</span><span class="n">stddev</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">ddof</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">std</span><span class="p">,</span> |
| <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,),</span> |
| <span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
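| |
| <span class="c1"># NOTE (editorial sketch, not part of the original source): the divisor is N - ddof,</span> |
| <span class="c1"># so for the two-row groups in the docstring example, column B of group A=1 holds</span> |
| <span class="c1"># the values 1 and 0 (mean 0.5) and:</span> |
| <span class="c1">#</span> |
| <span class="c1">#   ddof=1: sqrt(((1 - 0.5)**2 + (0 - 0.5)**2) / (2 - 1)) = sqrt(0.5) ≈ 0.707107</span> |
| <span class="c1">#   ddof=0: sqrt(((1 - 0.5)**2 + (0 - 0.5)**2) / 2)       = 0.5</span> |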
| |
| <div class="viewcode-block" id="GroupBy.sum"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.sum.html#pyspark.pandas.groupby.GroupBy.sum">[docs]</a> <span class="k">def</span> <span class="nf">sum</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute sum of group values.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numeric_only : bool, default False</span> |
| <span class="sd"> Include only float, int, boolean columns.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| <span class="sd"> min_count : int, default 0</span> |
| <span class="sd"> The required number of valid values to perform the operation.</span> |
| <span class="sd"> If fewer than min_count non-NA values are present the result will be NA.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],</span> |
| <span class="sd"> ... "C": [3, 4, 3, 4], "D": ["a", "a", "b", "a"]})</span> |
| |
| <span class="sd"> >>> df.groupby("A").sum().sort_index()</span> |
| <span class="sd"> B C D</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1 6 ab</span> |
| <span class="sd"> 2 1 8 aa</span> |
| |
| <span class="sd"> >>> df.groupby("D").sum().sort_index()</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> D</span> |
| <span class="sd"> a 5 2 11</span> |
| <span class="sd"> b 1 0 3</span> |
| |
| <span class="sd"> >>> df.groupby("D").sum(min_count=3).sort_index()</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> D</span> |
| <span class="sd"> a 5.0 2.0 11.0</span> |
| <span class="sd"> b NaN NaN NaN</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> There is a behavior difference between pandas-on-Spark and pandas:</span> |
| |
| <span class="sd"> * when there is a non-numeric aggregation column, it will be ignored</span> |
| <span class="sd"> even if `numeric_only` is False.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">numeric_only</span><span class="p">,</span> <span class="nb">bool</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"numeric_only must be None or bool"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">min_count</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"min_count must be integer"</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">numeric_only</span><span class="p">:</span> |
| <span class="n">unsupported</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">col</span><span class="o">.</span><span class="n">name</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">,</span> <span class="n">StringType</span><span class="p">))</span> |
| <span class="p">]</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">unsupported</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"GroupBy.sum() can only support numeric, bool and string columns even if "</span> |
| <span class="sa">f</span><span class="s2">"numeric_only=False, skip unsupported columns: </span><span class="si">{</span><span class="n">unsupported</span><span class="si">}</span><span class="s2">"</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">,</span> |
| <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">,</span> <span class="n">StringType</span><span class="p">),</span> |
| <span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span><span class="p">,</span> |
| <span class="p">)</span></div> |
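| |
| <span class="c1"># NOTE (editorial sketch, not part of the original source): sum() only accepts numeric,</span> |
| <span class="c1"># bool and string columns; anything else is skipped (with a log_advice hint) even when</span> |
| <span class="c1"># numeric_only=False. Booleans are summed as 0/1, so with the docstring's `df` above:</span> |
| <span class="c1">#</span> |
| <span class="c1">#   >>> df.groupby("D")["B"].sum().sort_index()</span> |
| <span class="c1">#   D</span> |
| <span class="c1">#   a    2</span> |
| <span class="c1">#   b    0</span> |
| <span class="c1">#   Name: B, dtype: int64</span> |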
| |
| <span class="c1"># TODO: sync the doc.</span> |
| <div class="viewcode-block" id="GroupBy.var"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.var.html#pyspark.pandas.groupby.GroupBy.var">[docs]</a> <span class="k">def</span> <span class="nf">var</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute variance of groups, excluding missing values.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> ddof : int, default 1</span> |
| <span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span> |
| <span class="sd"> where N represents the number of elements.</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports arbitrary integers.</span> |
| |
| <span class="sd"> numeric_only : bool, default False</span> |
| <span class="sd"> Include only float, int, boolean columns.</span> |
| |
| <span class="sd"> .. versionadded:: 4.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],</span> |
| <span class="sd"> ... "C": [3, 4, 3, 4], "D": ["a", "b", "b", "a"]})</span> |
| |
| <span class="sd"> >>> df.groupby("A").var()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 0.5 0.0</span> |
| <span class="sd"> 2 0.5 0.0</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ddof</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"ddof must be integer"</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">var</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">SF</span><span class="o">.</span><span class="n">var</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">ddof</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">var</span><span class="p">,</span> |
| <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,),</span> |
| <span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> |
| <span class="p">)</span></div> |
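| |
| <span class="c1"># NOTE (editorial sketch, not part of the original source): var() is the square of</span> |
| <span class="c1"># std() for the same ddof, which is why the docstring values line up with the std()</span> |
| <span class="c1"># example: 0.707107**2 ≈ 0.5 for column B and 0.0 for column C. Passing ddof=0</span> |
| <span class="c1"># switches the divisor from N - 1 to N:</span> |
| <span class="c1">#</span> |
| <span class="c1">#   >>> df.groupby("A").var(ddof=0)  # population variance per group</span> |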
| |
| <span class="k">def</span> <span class="nf">skew</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute skewness of groups, excluding missing values.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({"A": [1, 2, 1, 1], "B": [True, False, False, True],</span> |
| <span class="sd"> ... "C": [3, 4, 3, 4], "D": ["a", "b", "b", "a"]})</span> |
| |
| <span class="sd"> >>> df.groupby("A").skew()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 -1.732051 1.732051</span> |
| <span class="sd"> 2 NaN NaN</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">SF</span><span class="o">.</span><span class="n">skew</span><span class="p">,</span> |
| <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,),</span> |
| <span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="GroupBy.sem"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.sem.html#pyspark.pandas.groupby.GroupBy.sem">[docs]</a> <span class="k">def</span> <span class="nf">sem</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute standard error of the mean of groups, excluding missing values.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> ddof : int, default 1</span> |
| <span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span> |
| <span class="sd"> where N represents the number of elements.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({"A": [1, 2, 1, 1], "B": [True, False, False, True],</span> |
| <span class="sd"> ... "C": [3, None, 3, 4], "D": ["a", "b", "b", "a"]})</span> |
| |
| <span class="sd"> >>> df.groupby("A").sem()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 0.333333 0.333333</span> |
| <span class="sd"> 2 NaN NaN</span> |
| |
| <span class="sd"> >>> df.groupby("D").sem(ddof=1)</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> D</span> |
| <span class="sd"> a 0.0 0.0 0.5</span> |
| <span class="sd"> b 0.5 0.0 NaN</span> |
| |
| <span class="sd"> >>> df.B.groupby(df.A).sem()</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 0.333333</span> |
| <span class="sd"> 2 NaN</span> |
| <span class="sd"> Name: B, dtype: float64</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.sem</span> |
| <span class="sd"> pyspark.pandas.DataFrame.sem</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">ddof</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"ddof must be integer"</span><span class="p">)</span> |
| |
| <span class="c1"># Raise the TypeError when all aggregation columns are of unaccepted data types</span> |
| <span class="n">any_accepted</span> <span class="o">=</span> <span class="nb">any</span><span class="p">(</span> |
| <span class="nb">isinstance</span><span class="p">(</span><span class="n">_agg_col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">_agg_col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">any_accepted</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Unaccepted data types of aggregation columns; numeric or bool expected."</span> |
| <span class="p">)</span> |
| |
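| <span class="c1"># Standard error of the mean: sample standard deviation (with the given ddof) divided by the square root of the number of non-null values.</span> |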
| <span class="k">def</span> <span class="nf">sem</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">SF</span><span class="o">.</span><span class="n">stddev</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">ddof</span><span class="p">)</span> <span class="o">/</span> <span class="n">F</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">col</span><span class="p">))</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">sem</span><span class="p">,</span> |
| <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">),</span> |
| <span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <span class="c1"># TODO: 1, 'n' accepts list and slice; 2, implement 'dropna' parameter</span> |
| <div class="viewcode-block" id="GroupBy.nth"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.nth.html#pyspark.pandas.groupby.GroupBy.nth">[docs]</a> <span class="k">def</span> <span class="nf">nth</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Take the nth row from each group.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int</span> |
| <span class="sd"> A single nth value for the row</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> There is a behavior difference between pandas-on-Spark and pandas:</span> |
| |
| <span class="sd"> * when there is no aggregation column, and `n` not equal to 0 or -1,</span> |
| <span class="sd"> the returned empty dataframe may have an index with different lenght `__len__`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import numpy as np</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 1, 2, 1, 2],</span> |
| <span class="sd"> ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])</span> |
| <span class="sd"> >>> g = df.groupby('A')</span> |
| <span class="sd"> >>> g.nth(0)</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 NaN</span> |
| <span class="sd"> 2 2 3.0</span> |
| <span class="sd"> >>> g.nth(1)</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 1 1 2.0</span> |
| <span class="sd"> 4 2 5.0</span> |
| <span class="sd"> >>> g.nth(-1)</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 3 1 4.0</span> |
| <span class="sd"> 4 2 5.0</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="nb">slice</span><span class="p">)</span> <span class="ow">or</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">n</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span><span class="s2">"n doesn't support slice or list for now"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Invalid index </span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">n</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| |
| <span class="n">groupkey_names</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="nb">str</span><span class="p">(</span><span class="n">groupkey</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">groupkey</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">window1</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| <span class="n">tmp_row_number_col</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__tmp_row_number_col__"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">n</span> <span class="o">>=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">tmp_row_number_col</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window1</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">tmp_row_number_col</span><span class="p">)</span> <span class="o">==</span> <span class="n">n</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">tmp_row_number_col</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">window2</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">rowsBetween</span><span class="p">(</span> |
| <span class="n">Window</span><span class="o">.</span><span class="n">unboundedPreceding</span><span class="p">,</span> <span class="n">Window</span><span class="o">.</span><span class="n">unboundedFollowing</span> |
| <span class="p">)</span> |
| <span class="n">tmp_group_size_col</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__tmp_group_size_col__"</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">tmp_group_size_col</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window2</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">tmp_row_number_col</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window1</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">tmp_row_number_col</span><span class="p">)</span> <span class="o">==</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">tmp_group_size_col</span><span class="p">)</span> <span class="o">+</span> <span class="mi">1</span> <span class="o">+</span> <span class="n">n</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">tmp_group_size_col</span><span class="p">,</span> <span class="n">tmp_row_number_col</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span> |
| |
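| <span class="c1"># When no aggregation columns were explicitly selected, include the group keys among the output columns.</span> |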
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span> |
| <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">:</span> |
| <span class="n">agg_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">:</span> |
| <span class="n">agg_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">internal</span><span class="o">.</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">agg_column_names</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="p">[</span><span class="nb">str</span><span class="p">(</span><span class="n">agg_column</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">agg_column</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">]</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span> |
| <span class="k">else</span> <span class="kc">None</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_return</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">),</span> <span class="n">agg_column_names</span><span class="o">=</span><span class="n">agg_column_names</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.prod"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.prod.html#pyspark.pandas.groupby.GroupBy.prod">[docs]</a> <span class="k">def</span> <span class="nf">prod</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">min_count</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">0</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute prod of groups.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numeric_only : bool, default False</span> |
| <span class="sd"> Include only float, int, boolean columns.</span> |
| |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| |
| <span class="sd"> min_count : int, default 0</span> |
| <span class="sd"> The required number of valid values to perform the operation.</span> |
| <span class="sd"> If fewer than min_count non-NA values are present the result will be NA.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| <span class="sd"> Computed prod of values within each group.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import numpy as np</span> |
| <span class="sd"> >>> df = ps.DataFrame(</span> |
| <span class="sd"> ... {</span> |
| <span class="sd"> ... "A": [1, 1, 2, 1, 2],</span> |
| <span class="sd"> ... "B": [np.nan, 2, 3, 4, 5],</span> |
| <span class="sd"> ... "C": [1, 2, 1, 1, 2],</span> |
| <span class="sd"> ... "D": [True, False, True, False, True],</span> |
| <span class="sd"> ... }</span> |
| <span class="sd"> ... )</span> |
| |
| <span class="sd"> Groupby one column and return the prod of the remaining columns in</span> |
| <span class="sd"> each group.</span> |
| |
| <span class="sd"> >>> df.groupby('A').prod().sort_index()</span> |
| <span class="sd"> B C D</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 8.0 2 0</span> |
| <span class="sd"> 2 15.0 2 1</span> |
| |
| <span class="sd"> >>> df.groupby('A').prod(min_count=3).sort_index()</span> |
| <span class="sd"> B C D</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 NaN 2.0 0.0</span> |
| <span class="sd"> 2 NaN NaN NaN</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">min_count</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"min_count must be integer"</span><span class="p">)</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_validate_agg_columns</span><span class="p">(</span><span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">function_name</span><span class="o">=</span><span class="s2">"prod"</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">SF</span><span class="o">.</span><span class="n">product</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="kc">True</span><span class="p">),</span> |
| <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">),</span> |
| <span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">min_count</span><span class="o">=</span><span class="n">min_count</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.all"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.all.html#pyspark.pandas.groupby.GroupBy.all">[docs]</a> <span class="k">def</span> <span class="nf">all</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns True if all values in the group are truthful, else False.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> skipna : bool, default True</span> |
| <span class="sd"> Flag to ignore NA(nan/null) values during truth testing.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],</span> |
| <span class="sd"> ... 'B': [True, True, True, False, False,</span> |
| <span class="sd"> ... False, None, True, None, False]},</span> |
| <span class="sd"> ... columns=['A', 'B'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 True</span> |
| <span class="sd"> 1 1 True</span> |
| <span class="sd"> 2 2 True</span> |
| <span class="sd"> 3 2 False</span> |
| <span class="sd"> 4 3 False</span> |
| <span class="sd"> 5 3 False</span> |
| <span class="sd"> 6 4 None</span> |
| <span class="sd"> 7 4 True</span> |
| <span class="sd"> 8 5 None</span> |
| <span class="sd"> 9 5 False</span> |
| |
| <span class="sd"> >>> df.groupby('A').all().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 True</span> |
| <span class="sd"> 2 False</span> |
| <span class="sd"> 3 False</span> |
| <span class="sd"> 4 True</span> |
| <span class="sd"> 5 False</span> |
| |
| <span class="sd"> >>> df.groupby('A').all(skipna=False).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 True</span> |
| <span class="sd"> 2 False</span> |
| <span class="sd"> 3 False</span> |
| <span class="sd"> 4 False</span> |
| <span class="sd"> 5 False</span> |
| <span class="sd"> """</span> |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span> |
| <span class="n">internal</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_reduce</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
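| <span class="c1"># Logical AND per group via F.min over boolean-cast values; nulls are ignored when `skipna` (or the column is numeric), otherwise treated as False.</span> |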
| <span class="k">def</span> <span class="nf">sfun</span><span class="p">(</span><span class="n">scol</span><span class="p">:</span> <span class="n">Column</span><span class="p">,</span> <span class="n">scol_type</span><span class="p">:</span> <span class="n">DataType</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">scol_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)</span> <span class="ow">or</span> <span class="n">skipna</span><span class="p">:</span> |
| <span class="c1"># np.nan takes no effect to the result; None takes no effect if `skipna`</span> |
| <span class="n">all_col</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">"boolean"</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># Take None as False when not `skipna`</span> |
| <span class="n">all_col</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">scol</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">"boolean"</span><span class="p">)))</span> |
| <span class="k">return</span> <span class="n">all_col</span> |
| |
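| <span class="c1"># Aggregate each data column with sfun when data columns exist; otherwise just keep the distinct group keys.</span> |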
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">stat_exprs</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">stat_exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">sfun</span><span class="p">(</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">nan_to_null</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">stat_exprs</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_return</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <span class="c1"># TODO: skipna should be implemented.</span> |
| <div class="viewcode-block" id="GroupBy.any"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.any.html#pyspark.pandas.groupby.GroupBy.any">[docs]</a> <span class="k">def</span> <span class="nf">any</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns True if any value in the group is truthful, else False.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],</span> |
| <span class="sd"> ... 'B': [True, True, True, False, False,</span> |
| <span class="sd"> ... False, None, True, None, False]},</span> |
| <span class="sd"> ... columns=['A', 'B'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 True</span> |
| <span class="sd"> 1 1 True</span> |
| <span class="sd"> 2 2 True</span> |
| <span class="sd"> 3 2 False</span> |
| <span class="sd"> 4 3 False</span> |
| <span class="sd"> 5 3 False</span> |
| <span class="sd"> 6 4 None</span> |
| <span class="sd"> 7 4 True</span> |
| <span class="sd"> 8 5 None</span> |
| <span class="sd"> 9 5 False</span> |
| |
| <span class="sd"> >>> df.groupby('A').any().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 True</span> |
| <span class="sd"> 2 True</span> |
| <span class="sd"> 3 False</span> |
| <span class="sd"> 4 True</span> |
| <span class="sd"> 5 False</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">"boolean"</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">)))</span> |
| <span class="p">)</span></div> |
| |
| <span class="c1"># TODO: groupby multiply columns should be implemented.</span> |
| <div class="viewcode-block" id="GroupBy.size"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.size.html#pyspark.pandas.groupby.GroupBy.size">[docs]</a> <span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute group sizes.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'B': [1, 1, 2, 3, 3, 3]},</span> |
| <span class="sd"> ... columns=['A', 'B'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 1</span> |
| <span class="sd"> 1 2 1</span> |
| <span class="sd"> 2 2 2</span> |
| <span class="sd"> 3 3 3</span> |
| <span class="sd"> 4 3 3</span> |
| <span class="sd"> 5 3 3</span> |
| |
| <span class="sd"> >>> df.groupby('A').size().sort_index()</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> df.groupby(['A', 'B']).size().sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 1 1 1</span> |
| <span class="sd"> 2 1 1</span> |
| <span class="sd"> 2 1</span> |
| <span class="sd"> 3 3 3</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> For Series,</span> |
| |
| <span class="sd"> >>> df.B.groupby(df.A).size().sort_index()</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| |
| <span class="sd"> >>> df.groupby(df.A).B.size().sort_index()</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="n">groupkeys</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span> |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))]</span> |
| <span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="n">groupkey_scols</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">groupkeys</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"count"</span><span class="p">)],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.diff"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.diff.html#pyspark.pandas.groupby.GroupBy.diff">[docs]</a> <span class="k">def</span> <span class="nf">diff</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> First discrete difference of element.</span> |
| |
| <span class="sd"> Calculates the difference of a DataFrame element compared with another element in the</span> |
| <span class="sd"> DataFrame group (default is the element in the same column of the previous row).</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> periods : int, default 1</span> |
| <span class="sd"> Periods to shift for calculating difference, accepts negative values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> diffed : DataFrame or Series</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4, 5, 6],</span> |
| <span class="sd"> ... 'b': [1, 1, 2, 3, 5, 8],</span> |
| <span class="sd"> ... 'c': [1, 4, 9, 16, 25, 36]}, columns=['a', 'b', 'c'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 1 1 1</span> |
| <span class="sd"> 1 2 1 4</span> |
| <span class="sd"> 2 3 2 9</span> |
| <span class="sd"> 3 4 3 16</span> |
| <span class="sd"> 4 5 5 25</span> |
| <span class="sd"> 5 6 8 36</span> |
| |
| <span class="sd"> >>> df.groupby(['b']).diff().sort_index()</span> |
| <span class="sd"> a c</span> |
| <span class="sd"> 0 NaN NaN</span> |
| <span class="sd"> 1 1.0 3.0</span> |
| <span class="sd"> 2 NaN NaN</span> |
| <span class="sd"> 3 NaN NaN</span> |
| <span class="sd"> 4 NaN NaN</span> |
| <span class="sd"> 5 NaN NaN</span> |
| |
| <span class="sd"> Difference with previous column in a group.</span> |
| |
| <span class="sd"> >>> df.groupby(['b'])['a'].diff().sort_index()</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 1.0</span> |
| <span class="sd"> 2 NaN</span> |
| <span class="sd"> 3 NaN</span> |
| <span class="sd"> 4 NaN</span> |
| <span class="sd"> 5 NaN</span> |
| <span class="sd"> Name: a, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_diff</span><span class="p">(</span><span class="n">periods</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.cumcount"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cumcount.html#pyspark.pandas.groupby.GroupBy.cumcount">[docs]</a> <span class="k">def</span> <span class="nf">cumcount</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Number each item in each group from 0 to the length of that group - 1.</span> |
| |
| <span class="sd"> Essentially this is equivalent to</span> |
| |
| <span class="sd"> .. code-block:: python</span> |
| |
| <span class="sd"> self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> ascending : bool, default True</span> |
| <span class="sd"> If False, number in reverse, from length of group - 1 to 0.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Sequence number of each element within each group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],</span> |
| <span class="sd"> ... columns=['A'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 0 a</span> |
| <span class="sd"> 1 a</span> |
| <span class="sd"> 2 a</span> |
| <span class="sd"> 3 b</span> |
| <span class="sd"> 4 b</span> |
| <span class="sd"> 5 a</span> |
| <span class="sd"> >>> df.groupby('A').cumcount().sort_index()</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 0</span> |
| <span class="sd"> 4 1</span> |
| <span class="sd"> 5 3</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> >>> df.groupby('A').cumcount(ascending=False).sort_index()</span> |
| <span class="sd"> 0 3</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 1</span> |
| <span class="sd"> 3 1</span> |
| <span class="sd"> 4 0</span> |
| <span class="sd"> 5 0</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="n">ret</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="o">.</span><span class="n">rename</span><span class="p">()</span> |
| <span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="k">lambda</span> <span class="n">_</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">ascending</span><span class="o">=</span><span class="n">ascending</span><span class="p">)</span> |
| <span class="o">-</span> <span class="mi">1</span> |
| <span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">ret</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.cummax"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cummax.html#pyspark.pandas.groupby.GroupBy.cummax">[docs]</a> <span class="k">def</span> <span class="nf">cummax</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Cumulative max for each group.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.cummax</span> |
| <span class="sd"> DataFrame.cummax</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame(</span> |
| <span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span> |
| <span class="sd"> ... columns=list('ABC'))</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 1 NaN 4</span> |
| <span class="sd"> 1 1 0.1 3</span> |
| <span class="sd"> 2 1 20.0 2</span> |
| <span class="sd"> 3 4 10.0 1</span> |
| |
| <span class="sd"> By default, iterates over rows and finds the sum in each column.</span> |
| |
| <span class="sd"> >>> df.groupby("A").cummax().sort_index()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 NaN 4</span> |
| <span class="sd"> 1 0.1 4</span> |
| <span class="sd"> 2 20.0 4</span> |
| <span class="sd"> 3 10.0 1</span> |
| |
| <span class="sd"> It works as below in Series.</span> |
| |
| <span class="sd"> >>> df.C.groupby(df.A).cummax().sort_index()</span> |
| <span class="sd"> 0 4</span> |
| <span class="sd"> 1 4</span> |
| <span class="sd"> 2 4</span> |
| <span class="sd"> 3 1</span> |
| <span class="sd"> Name: C, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> |
| <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.cummin"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cummin.html#pyspark.pandas.groupby.GroupBy.cummin">[docs]</a> <span class="k">def</span> <span class="nf">cummin</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Cumulative min for each group.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.cummin</span> |
| <span class="sd"> DataFrame.cummin</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame(</span> |
| <span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span> |
| <span class="sd"> ... columns=list('ABC'))</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 1 NaN 4</span> |
| <span class="sd"> 1 1 0.1 3</span> |
| <span class="sd"> 2 1 20.0 2</span> |
| <span class="sd"> 3 4 10.0 1</span> |
| |
| <span class="sd"> By default, iterates over rows and finds the sum in each column.</span> |
| |
| <span class="sd"> >>> df.groupby("A").cummin().sort_index()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 NaN 4</span> |
| <span class="sd"> 1 0.1 3</span> |
| <span class="sd"> 2 0.1 2</span> |
| <span class="sd"> 3 10.0 1</span> |
| |
| <span class="sd"> It works as below in Series.</span> |
| |
| <span class="sd"> >>> df.B.groupby(df.A).cummin().sort_index()</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 0.1</span> |
| <span class="sd"> 2 0.1</span> |
| <span class="sd"> 3 10.0</span> |
| <span class="sd"> Name: B, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> |
| <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.cumprod"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cumprod.html#pyspark.pandas.groupby.GroupBy.cumprod">[docs]</a> <span class="k">def</span> <span class="nf">cumprod</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Cumulative product for each group.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.cumprod</span> |
| <span class="sd"> DataFrame.cumprod</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame(</span> |
| <span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span> |
| <span class="sd"> ... columns=list('ABC'))</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 1 NaN 4</span> |
| <span class="sd"> 1 1 0.1 3</span> |
| <span class="sd"> 2 1 20.0 2</span> |
| <span class="sd"> 3 4 10.0 1</span> |
| |
| <span class="sd"> By default, iterates over rows and finds the sum in each column.</span> |
| |
| <span class="sd"> >>> df.groupby("A").cumprod().sort_index()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 NaN 4</span> |
| <span class="sd"> 1 0.1 12</span> |
| <span class="sd"> 2 2.0 24</span> |
| <span class="sd"> 3 10.0 1</span> |
| |
| <span class="sd"> It works as below in Series.</span> |
| |
| <span class="sd"> >>> df.B.groupby(df.A).cumprod().sort_index()</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 0.1</span> |
| <span class="sd"> 2 2.0</span> |
| <span class="sd"> 3 10.0</span> |
| <span class="sd"> Name: B, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cumprod</span><span class="p">(</span><span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> |
| <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.cumsum"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cumsum.html#pyspark.pandas.groupby.GroupBy.cumsum">[docs]</a> <span class="k">def</span> <span class="nf">cumsum</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Cumulative sum for each group.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.cumsum</span> |
| <span class="sd"> DataFrame.cumsum</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame(</span> |
| <span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span> |
| <span class="sd"> ... columns=list('ABC'))</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 1 NaN 4</span> |
| <span class="sd"> 1 1 0.1 3</span> |
| <span class="sd"> 2 1 20.0 2</span> |
| <span class="sd"> 3 4 10.0 1</span> |
| |
| <span class="sd"> By default, iterates over rows and finds the sum in each column.</span> |
| |
| <span class="sd"> >>> df.groupby("A").cumsum().sort_index()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 NaN 4</span> |
| <span class="sd"> 1 0.1 7</span> |
| <span class="sd"> 2 20.1 9</span> |
| <span class="sd"> 3 10.0 1</span> |
| |
| <span class="sd"> It works as below in Series.</span> |
| |
| <span class="sd"> >>> df.B.groupby(df.A).cumsum().sort_index()</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 0.1</span> |
| <span class="sd"> 2 20.1</span> |
| <span class="sd"> 3 10.0</span> |
| <span class="sd"> Name: B, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cumsum</span><span class="p">(</span><span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> |
| <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.apply"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.apply.html#pyspark.pandas.groupby.GroupBy.apply">[docs]</a> <span class="k">def</span> <span class="nf">apply</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Apply function `func` group-wise and combine the results together.</span> |
| |
| <span class="sd"> The function passed to `apply` must take a DataFrame as its first</span> |
| <span class="sd"> argument and return a DataFrame. `apply` will</span> |
| <span class="sd"> then take care of combining the results back together into a single</span> |
| <span class="sd"> dataframe. `apply` is therefore a highly flexible</span> |
| <span class="sd"> grouping method.</span> |
| |
| <span class="sd"> While `apply` is a very flexible method, its downside is that</span> |
| <span class="sd"> using it can be quite a bit slower than using more specific methods</span> |
| <span class="sd"> like `agg` or `transform`. pandas-on-Spark offers a wide range of method that will</span> |
| <span class="sd"> be much faster than using `apply` for their specific purposes, so try to</span> |
| <span class="sd"> use them before reaching for `apply`.</span> |
| |
| <span class="sd"> .. note:: this API executes the function once to infer the type which is</span> |
| <span class="sd"> potentially expensive, for instance, when the dataset is created after</span> |
| <span class="sd"> aggregations or sorting.</span> |
| |
| <span class="sd"> To avoid this, specify return type in ``func``, for instance, as below:</span> |
| |
| <span class="sd"> >>> def pandas_div(x) -> ps.DataFrame[int, [float, float]]:</span> |
| <span class="sd"> ... return x[['B', 'C']] / x[['B', 'C']]</span> |
| |
| <span class="sd"> If the return type is specified, the output column names become</span> |
| <span class="sd"> `c0, c1, c2 ... cn`. These names are positionally mapped to the returned</span> |
| <span class="sd"> DataFrame in ``func``.</span> |
| |
| <span class="sd"> To specify the column names, you can assign them in a NumPy compound type style</span> |
| <span class="sd"> as below:</span> |
| |
| <span class="sd"> >>> def pandas_div(x) -> ps.DataFrame[("index", int), [("a", float), ("b", float)]]:</span> |
| <span class="sd"> ... return x[['B', 'C']] / x[['B', 'C']]</span> |
| |
| <span class="sd"> >>> pdf = pd.DataFrame({'B': [1.], 'C': [3.]})</span> |
| <span class="sd"> >>> def plus_one(x) -> ps.DataFrame[</span> |
| <span class="sd"> ... (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:</span> |
| <span class="sd"> ... return x[['B', 'C']] / x[['B', 'C']]</span> |
| |
| <span class="sd"> .. note:: the dataframe within ``func`` is actually a pandas dataframe. Therefore,</span> |
| <span class="sd"> any pandas API within this function is allowed.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : callable</span> |
| <span class="sd"> A callable that takes a DataFrame as its first argument, and</span> |
| <span class="sd"> returns a dataframe.</span> |
| <span class="sd"> *args</span> |
| <span class="sd"> Positional arguments to pass to func.</span> |
| <span class="sd"> **kwargs</span> |
| <span class="sd"> Keyword arguments to pass to func.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> applied : DataFrame or Series</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> aggregate : Apply aggregate function to the GroupBy object.</span> |
| <span class="sd"> DataFrame.apply : Apply a function to a DataFrame.</span> |
| <span class="sd"> Series.apply : Apply a function to a Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': 'a a b'.split(),</span> |
| <span class="sd"> ... 'B': [1, 2, 3],</span> |
| <span class="sd"> ... 'C': [4, 6, 5]}, columns=['A', 'B', 'C'])</span> |
| <span class="sd"> >>> g = df.groupby('A')</span> |
| |
| <span class="sd"> Notice that ``g`` has two groups, ``a`` and ``b``.</span> |
| <span class="sd"> Calling `apply` in various ways, we can get different grouping results:</span> |
| |
| <span class="sd"> Below the functions passed to `apply` takes a DataFrame as</span> |
| <span class="sd"> its argument and returns a DataFrame. `apply` combines the result for</span> |
| <span class="sd"> each group together into a new DataFrame:</span> |
| |
| <span class="sd"> >>> def plus_min(x):</span> |
| <span class="sd"> ... return x + x.min()</span> |
| <span class="sd"> >>> g.apply(plus_min).sort_index() # doctest: +SKIP</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 aa 2 8</span> |
| <span class="sd"> 1 aa 3 10</span> |
| <span class="sd"> 2 bb 6 10</span> |
| |
| <span class="sd"> >>> g.apply(sum).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> a aa 3 10</span> |
| <span class="sd"> b b 3 5</span> |
| |
| <span class="sd"> >>> g.apply(len).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> A</span> |
| <span class="sd"> a 2</span> |
| <span class="sd"> b 1</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> You can specify the type hint and prevent schema inference for better performance.</span> |
| |
| <span class="sd"> >>> def pandas_div(x) -> ps.DataFrame[int, [float, float]]:</span> |
| <span class="sd"> ... return x[['B', 'C']] / x[['B', 'C']]</span> |
| <span class="sd"> >>> g.apply(pandas_div).sort_index() # doctest: +SKIP</span> |
| <span class="sd"> c0 c1</span> |
| <span class="sd"> 0 1.0 1.0</span> |
| <span class="sd"> 1 1.0 1.0</span> |
| <span class="sd"> 2 1.0 1.0</span> |
| |
| <span class="sd"> >>> def pandas_div(x) -> ps.DataFrame[("index", int), [("f1", float), ("f2", float)]]:</span> |
| <span class="sd"> ... return x[['B', 'C']] / x[['B', 'C']]</span> |
| <span class="sd"> >>> g.apply(pandas_div).sort_index() # doctest: +SKIP</span> |
| <span class="sd"> f1 f2</span> |
| <span class="sd"> index</span> |
| <span class="sd"> 0 1.0 1.0</span> |
| <span class="sd"> 1 1.0 1.0</span> |
| <span class="sd"> 2 1.0 1.0</span> |
| |
| <span class="sd"> In case of Series, it works as below.</span> |
| |
| <span class="sd"> >>> def plus_max(x) -> ps.Series[int]:</span> |
| <span class="sd"> ... return x + x.max()</span> |
| <span class="sd"> >>> df.B.groupby(df.A).apply(plus_max).sort_index() # doctest: +SKIP</span> |
| <span class="sd"> 0 6</span> |
| <span class="sd"> 1 3</span> |
| <span class="sd"> 2 4</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| |
| <span class="sd"> >>> def plus_min(x):</span> |
| <span class="sd"> ... return x + x.min()</span> |
| <span class="sd"> >>> df.B.groupby(df.A).apply(plus_min).sort_index() # doctest: +SKIP</span> |
| <span class="sd"> 0 2</span> |
| <span class="sd"> 1 3</span> |
| <span class="sd"> 2 6</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| |
| <span class="sd"> You can also return a scalar value as an aggregated value of the group:</span> |
| |
| <span class="sd"> >>> def plus_length(x) -> int:</span> |
| <span class="sd"> ... return len(x)</span> |
| <span class="sd"> >>> df.B.groupby(df.A).apply(plus_length).sort_index() # doctest: +SKIP</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| |
| <span class="sd"> The extra arguments to the function can be passed as below.</span> |
| |
| <span class="sd"> >>> def calculation(x, y, z) -> int:</span> |
| <span class="sd"> ... return len(x) + y * z</span> |
| <span class="sd"> >>> df.B.groupby(df.A).apply(calculation, 5, z=10).sort_index() # doctest: +SKIP</span> |
| <span class="sd"> 0 51</span> |
| <span class="sd"> 1 52</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">callable</span><span class="p">(</span><span class="n">func</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"</span><span class="si">%s</span><span class="s2"> object is not callable"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| |
| <span class="n">spec</span> <span class="o">=</span> <span class="n">inspect</span><span class="o">.</span><span class="n">getfullargspec</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> |
| <span class="n">return_sig</span> <span class="o">=</span> <span class="n">spec</span><span class="o">.</span><span class="n">annotations</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"return"</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> |
| <span class="n">should_infer_schema</span> <span class="o">=</span> <span class="n">return_sig</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="n">should_retain_index</span> <span class="o">=</span> <span class="n">should_infer_schema</span> |
| |
| <span class="n">is_series_groupby</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">SeriesGroupBy</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> |
| |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span> |
| <span class="p">]</span> |
| |
| <span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">agg_columns</span> |
| <span class="p">)</span> |
| |
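| <span class="c1"># After _prepare_group_map_apply, the temporary group-key columns come first and the</span> |
| <span class="c1"># aggregation columns follow. For a SeriesGroupBy the single aggregation column is the</span> |
| <span class="c1"># last column; for a DataFrameGroupBy, the group-key columns are dropped before calling `func`.</span> |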
| <span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> |
| <span class="n">pandas_apply</span> <span class="o">=</span> <span class="n">_builtin_table</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">func</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">f</span> <span class="o">=</span> <span class="n">_builtin_table</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">func</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">pandas_apply</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="o">*</span><span class="n">a</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">k</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">f</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">),</span> <span class="o">*</span><span class="n">a</span><span class="p">,</span> <span class="o">**</span><span class="n">k</span><span class="p">)</span> |
| |
| <span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">False</span> |
| |
| <span class="k">if</span> <span class="n">should_infer_schema</span><span class="p">:</span> |
| <span class="c1"># Here we execute with the first 1000 to get the return type.</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"If the type hints is not specified for `groupby.apply`, "</span> |
| <span class="s2">"it is expensive to infer the data type internally."</span> |
| <span class="p">)</span> |
| <span class="n">limit</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"compute.shortcut_limit"</span><span class="p">)</span> |
| <span class="c1"># Ensure sampling rows >= 2 to make sure apply's infer schema is accurate</span> |
| <span class="c1"># See related: https://github.com/pandas-dev/pandas/issues/46893</span> |
| <span class="n">sample_limit</span> <span class="o">=</span> <span class="n">limit</span> <span class="o">+</span> <span class="mi">1</span> <span class="k">if</span> <span class="n">limit</span> <span class="k">else</span> <span class="mi">2</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">sample_limit</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span> |
| <span class="n">groupkeys</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">pdf</span><span class="p">[</span><span class="n">groupkey_name</span><span class="p">]</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">groupkey_name</span><span class="p">,</span> <span class="n">psser</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">grouped</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| <span class="n">pser_or_pdf</span> <span class="o">=</span> <span class="n">grouped</span><span class="p">[</span><span class="n">name</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">pser_or_pdf</span> <span class="o">=</span> <span class="n">grouped</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="n">psser_or_psdf</span> <span class="o">=</span> <span class="n">ps</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">pser_or_pdf</span><span class="o">.</span><span class="n">infer_objects</span><span class="p">())</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o"><=</span> <span class="n">limit</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser_or_psdf</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| <span class="n">psser_or_psdf</span> <span class="o">=</span> <span class="n">psser_or_psdf</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">SeriesGroupBy</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">],</span> <span class="n">psser_or_psdf</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">grouped</span><span class="p">)</span> <span class="o"><=</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">with</span> <span class="n">warnings</span><span class="o">.</span><span class="n">catch_warnings</span><span class="p">():</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">simplefilter</span><span class="p">(</span><span class="s2">"always"</span><span class="p">)</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"The amount of data for return type inference might not be large enough. "</span> |
| <span class="s2">"Consider increasing an option `compute.shortcut_limit`."</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser_or_psdf</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="n">psdf_from_pandas</span> <span class="o">=</span> <span class="n">psser_or_psdf</span><span class="o">.</span><span class="n">_psdf</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">psdf_from_pandas</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">psser_or_psdf</span><span class="p">)</span> |
| |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">normalize_spark_type</span><span class="p">()</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span> |
| <span class="p">]</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">normalize_spark_type</span><span class="p">()</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span> |
| <span class="p">]</span> |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">index_fields</span> <span class="o">+</span> <span class="n">data_fields</span><span class="p">])</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">return_type</span> <span class="o">=</span> <span class="n">infer_return_type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_series_groupby</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">return_type</span><span class="p">,</span> <span class="n">SeriesType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Series as a return type hint at frame groupby is not supported "</span> |
| <span class="s2">"currently; however got [</span><span class="si">%s</span><span class="s2">]. Use DataFrame type hint instead."</span> <span class="o">%</span> <span class="n">return_sig</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">return_type</span><span class="p">,</span> <span class="n">DataFrameType</span><span class="p">):</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">data_fields</span> |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">spark_type</span> |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">index_fields</span> |
| <span class="n">should_retain_index</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">index_fields</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span> |
| <span class="n">psdf_from_pandas</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="n">dtype</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">SeriesType</span><span class="p">,</span> <span class="n">ScalarType</span><span class="p">],</span> <span class="n">return_type</span><span class="p">)</span><span class="o">.</span><span class="n">dtype</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">SeriesType</span><span class="p">,</span> <span class="n">ScalarType</span><span class="p">],</span> <span class="n">return_type</span><span class="p">)</span><span class="o">.</span><span class="n">spark_type</span> |
| <span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">InternalField</span><span class="p">(</span> |
| <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">struct_field</span><span class="o">=</span><span class="n">StructField</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span> <span class="n">dataType</span><span class="o">=</span><span class="n">spark_type</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">InternalField</span><span class="p">(</span> |
| <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> |
| <span class="n">struct_field</span><span class="o">=</span><span class="n">StructField</span><span class="p">(</span> |
| <span class="n">name</span><span class="o">=</span><span class="n">SPARK_DEFAULT_SERIES_NAME</span><span class="p">,</span> <span class="n">dataType</span><span class="o">=</span><span class="n">spark_type</span> |
| <span class="p">),</span> |
| <span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">data_fields</span><span class="p">])</span> |
| |
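| <span class="c1"># Called for each group handed to the UDF: rerun the pandas groupby-apply so pandas</span> |
| <span class="c1"># semantics are preserved, stack a DataFrame result when a Series return is expected,</span> |
| <span class="c1"># and always hand a pandas DataFrame back to Spark.</span> |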
| <span class="k">def</span> <span class="nf">pandas_groupby_apply</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| <span class="n">pdf_or_ser</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)[</span><span class="n">name</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">pdf_or_ser</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">should_return_series</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_ser</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="n">pdf_or_ser</span> <span class="o">=</span> <span class="n">pdf_or_ser</span><span class="o">.</span><span class="n">stack</span><span class="p">()</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_ser</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf_or_ser</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pdf_or_ser</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">pandas_groupby_apply</span><span class="p">,</span> |
| <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span> |
| <span class="n">return_schema</span><span class="p">,</span> |
| <span class="n">retain_index</span><span class="o">=</span><span class="n">should_retain_index</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">should_retain_index</span><span class="p">:</span> |
| <span class="c1"># If schema is inferred, we can restore indexes too.</span> |
| <span class="k">if</span> <span class="n">psdf_from_pandas</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]]</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <span class="n">index_spark_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_field</span><span class="o">.</span><span class="n">struct_field</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">index_field</span> <span class="ow">in</span> <span class="n">index_fields</span> |
| <span class="p">]</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">any</span><span class="p">(</span> |
| <span class="p">[</span> |
| <span class="n">SPARK_INDEX_NAME_PATTERN</span><span class="o">.</span><span class="n">match</span><span class="p">(</span><span class="n">index_field</span><span class="o">.</span><span class="n">struct_field</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">index_field</span> <span class="ow">in</span> <span class="n">index_fields</span> |
| <span class="p">]</span> |
| <span class="p">):</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="p">[(</span><span class="n">index_field</span><span class="o">.</span><span class="n">struct_field</span><span class="o">.</span><span class="n">name</span><span class="p">,)</span> <span class="k">for</span> <span class="n">index_field</span> <span class="ow">in</span> <span class="n">index_fields</span><span class="p">]</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># Otherwise, it loses index.</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">should_return_series</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">SeriesGroupBy</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">psser</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <span class="c1"># TODO: implement 'dropna' parameter</span> |
| <div class="viewcode-block" id="GroupBy.filter"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.filter.html#pyspark.pandas.groupby.GroupBy.filter">[docs]</a> <span class="k">def</span> <span class="nf">filter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">FrameLike</span><span class="p">],</span> <span class="n">FrameLike</span><span class="p">])</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return a copy of a DataFrame excluding elements from groups that</span> |
| <span class="sd"> do not satisfy the boolean criterion specified by func.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> f : function</span> |
| <span class="sd"> Function to apply to each subframe. Should return True or False.</span> |
| <span class="sd"> dropna : Drop groups that do not pass the filter. True by default;</span> |
| <span class="sd"> if False, groups that evaluate False are filled with NaNs.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> filtered : DataFrame or Series</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Each subframe is endowed the attribute 'name' in case you need to know</span> |
| <span class="sd"> which group you are working on.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',</span> |
| <span class="sd"> ... 'foo', 'bar'],</span> |
| <span class="sd"> ... 'B' : [1, 2, 3, 4, 5, 6],</span> |
| <span class="sd"> ... 'C' : [2.0, 5., 8., 1., 2., 9.]}, columns=['A', 'B', 'C'])</span> |
| <span class="sd"> >>> grouped = df.groupby('A')</span> |
| <span class="sd"> >>> grouped.filter(lambda x: x['B'].mean() > 3.)</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 1 bar 2 5.0</span> |
| <span class="sd"> 3 bar 4 1.0</span> |
| <span class="sd"> 5 bar 6 9.0</span> |
| |
| <span class="sd"> >>> df.B.groupby(df.A).filter(lambda x: x.mean() > 3.)</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> 5 6</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">callable</span><span class="p">(</span><span class="n">func</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"</span><span class="si">%s</span><span class="s2"> object is not callable"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| |
| <span class="n">is_series_groupby</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">SeriesGroupBy</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> |
| |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span> |
| <span class="p">]</span> |
| |
| <span class="n">data_schema</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">agg_columns</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span><span class="o">.</span><span class="n">schema</span> |
| <span class="p">)</span> |
| |
| <span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">agg_columns</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| |
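| <span class="c1"># For a SeriesGroupBy, filter the single aggregation column (the last column) and wrap</span> |
| <span class="c1"># the result back into a pandas DataFrame.</span> |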
| <span class="k">def</span> <span class="nf">pandas_filter</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)[</span><span class="n">pdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]]</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">func</span><span class="p">))</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">f</span> <span class="o">=</span> <span class="n">_builtin_table</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">func</span><span class="p">)</span> |
| |
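| <span class="c1"># For a DataFrameGroupBy, hide the temporary group-key columns from the user function,</span> |
| <span class="c1"># then drop them from the filtered result so only the original columns remain.</span> |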
| <span class="k">def</span> <span class="nf">wrapped_func</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">f</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">))</span> |
| |
| <span class="k">def</span> <span class="nf">pandas_filter</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">wrapped_func</span><span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">pandas_filter</span><span class="p">,</span> |
| <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span> |
| <span class="n">data_schema</span><span class="p">,</span> |
| <span class="n">retain_index</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">[</span><span class="n">agg_columns</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">FrameLike</span><span class="p">,</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">FrameLike</span><span class="p">,</span> <span class="n">psdf</span><span class="p">)</span></div> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_prepare_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">groupkeys</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> <span class="n">agg_columns</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]:</span> |
| <span class="n">groupkey_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="s2">"__groupkey_</span><span class="si">{}</span><span class="s2">__"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))</span> |
| <span class="p">]</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[[</span><span class="n">s</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">)]</span> <span class="o">+</span> <span class="n">agg_columns</span><span class="p">]</span> |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span> <span class="k">else</span> <span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">]</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="p">),</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_spark_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> |
| <span class="n">groupkeys_scols</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> |
| <span class="n">return_schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">,</span> |
| <span class="n">retain_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">SparkDataFrame</span><span class="p">:</span> |
| <span class="n">output_func</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_make_pandas_df_builder_func</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">func</span><span class="p">,</span> <span class="n">return_schema</span><span class="p">,</span> <span class="n">retain_index</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkeys_scols</span><span class="p">)</span><span class="o">.</span><span class="n">applyInPandas</span><span class="p">(</span><span class="n">output_func</span><span class="p">,</span> <span class="n">return_schema</span><span class="p">)</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_make_pandas_df_builder_func</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> |
| <span class="n">return_schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">,</span> |
| <span class="n">retain_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Callable</span><span class="p">[[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Creates a function that can be used inside the pandas UDF. This function can construct</span> |
| <span class="sd"> the same pandas DataFrame as if the pandas-on-Spark DataFrame is collected to driver side.</span> |
| <span class="sd"> The index, column labels, etc. are re-constructed within the function.</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.utils</span> <span class="kn">import</span> <span class="n">is_timestamp_ntz_preferred</span> |
| |
| <span class="n">arguments_for_restore_index</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">arguments_for_restore_index</span> |
| <span class="n">prefer_timestamp_ntz</span> <span class="o">=</span> <span class="n">is_timestamp_ntz_preferred</span><span class="p">()</span> |
| |
| <span class="k">def</span> <span class="nf">rename_output</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">restore_index</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">copy</span><span class="p">(),</span> <span class="o">**</span><span class="n">arguments_for_restore_index</span><span class="p">)</span> |
| |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">func</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> |
| |
| <span class="c1"># If schema should be inferred, we don't restore the index. pandas seems to restore</span> |
| <span class="c1"># the index in some cases.</span> |
| <span class="c1"># When Spark output type is specified, without executing it, we don't know</span> |
| <span class="c1"># if we should restore the index or not. For instance, see the example in</span> |
| <span class="c1"># https://github.com/databricks/koalas/issues/628.</span> |
| <span class="n">pdf</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">prepare_pandas_frame</span><span class="p">(</span> |
| <span class="n">pdf</span><span class="p">,</span> <span class="n">retain_index</span><span class="o">=</span><span class="n">retain_index</span><span class="p">,</span> <span class="n">prefer_timestamp_ntz</span><span class="o">=</span><span class="n">prefer_timestamp_ntz</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># Just positionally map the column names to given schema's.</span> |
| <span class="n">pdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">return_schema</span><span class="o">.</span><span class="n">names</span> |
| |
| <span class="k">return</span> <span class="n">pdf</span> |
| |
| <span class="k">return</span> <span class="n">rename_output</span> |
| |
| <div class="viewcode-block" id="GroupBy.rank"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.rank.html#pyspark.pandas.groupby.GroupBy.rank">[docs]</a> <span class="k">def</span> <span class="nf">rank</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"average"</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Provide the rank of values within each group.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'</span> |
| <span class="sd"> * average: average rank of group</span> |
| <span class="sd"> * min: lowest rank in group</span> |
| <span class="sd"> * max: highest rank in group</span> |
| <span class="sd"> * first: ranks assigned in order they appear in the array</span> |
| <span class="sd"> * dense: like 'min', but rank always increases by 1 between groups</span> |
| <span class="sd"> ascending : boolean, default True</span> |
| <span class="sd"> False for ranks by high (1) to low (N)</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame with ranking of values within each group</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 1</span> |
| <span class="sd"> 1 1 2</span> |
| <span class="sd"> 2 1 2</span> |
| <span class="sd"> 3 2 2</span> |
| <span class="sd"> 4 2 3</span> |
| <span class="sd"> 5 2 3</span> |
| <span class="sd"> 6 3 3</span> |
| <span class="sd"> 7 3 4</span> |
| <span class="sd"> 8 3 4</span> |
| |
| <span class="sd"> >>> df.groupby("a").rank().sort_index()</span> |
| <span class="sd"> b</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.5</span> |
| <span class="sd"> 2 2.5</span> |
| <span class="sd"> 3 1.0</span> |
| <span class="sd"> 4 2.5</span> |
| <span class="sd"> 5 2.5</span> |
| <span class="sd"> 6 1.0</span> |
| <span class="sd"> 7 2.5</span> |
| <span class="sd"> 8 2.5</span> |
| |
| <span class="sd"> >>> df.b.groupby(df.a).rank(method='max').sort_index()</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 3.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 3 1.0</span> |
| <span class="sd"> 4 3.0</span> |
| <span class="sd"> 5 3.0</span> |
| <span class="sd"> 6 1.0</span> |
| <span class="sd"> 7 3.0</span> |
| <span class="sd"> 8 3.0</span> |
| <span class="sd"> Name: b, dtype: float64</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_rank</span><span class="p">(</span><span class="n">method</span><span class="p">,</span> <span class="n">ascending</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> |
| <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.idxmax"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.idxmax.html#pyspark.pandas.groupby.GroupBy.idxmax">[docs]</a> <span class="k">def</span> <span class="nf">idxmax</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return index of first occurrence of maximum over requested axis in group.</span> |
| <span class="sd"> NA/null values are excluded.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> skipna : boolean, default True</span> |
| <span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.idxmax</span> |
| <span class="sd"> DataFrame.idxmax</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 2, 2, 3],</span> |
| <span class="sd"> ... 'b': [1, 2, 3, 4, 5],</span> |
| <span class="sd"> ... 'c': [5, 4, 3, 2, 1]}, columns=['a', 'b', 'c'])</span> |
| |
| <span class="sd"> >>> df.groupby(['a'])['b'].idxmax().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> Name: b, dtype: int64</span> |
| |
| <span class="sd"> >>> df.groupby(['a']).idxmax().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b c</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 1 0</span> |
| <span class="sd"> 2 3 2</span> |
| <span class="sd"> 3 4 4</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"idxmax only support one-level index now"</span><span class="p">)</span> |
| |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"__groupkey_</span><span class="si">{}</span><span class="s2">__"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">):</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| <span class="n">index</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| <span class="n">stat_exprs</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_scols</span><span class="p">):</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="n">skipna</span><span class="p">:</span> |
| <span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">desc_nulls_last</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">desc_nulls_first</span><span class="p">()</span> |
| |
| <span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span> |
| <span class="n">order_column</span><span class="p">,</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">name</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">,</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">stat_exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">))</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">stat_exprs</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_handle_output</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.idxmin"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.idxmin.html#pyspark.pandas.groupby.GroupBy.idxmin">[docs]</a> <span class="k">def</span> <span class="nf">idxmin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return index of first occurrence of minimum over requested axis in group.</span> |
| <span class="sd"> NA/null values are excluded.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> skipna : boolean, default True</span> |
| <span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.idxmin</span> |
| <span class="sd"> DataFrame.idxmin</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 2, 2, 3],</span> |
| <span class="sd"> ... 'b': [1, 2, 3, 4, 5],</span> |
| <span class="sd"> ... 'c': [5, 4, 3, 2, 1]}, columns=['a', 'b', 'c'])</span> |
| |
| <span class="sd"> >>> df.groupby(['a'])['b'].idxmin().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 0</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> Name: b, dtype: int64</span> |
| |
| <span class="sd"> >>> df.groupby(['a']).idxmin().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b c</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 0 1</span> |
| <span class="sd"> 2 2 3</span> |
| <span class="sd"> 3 4 4</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"idxmin only support one-level index now"</span><span class="p">)</span> |
| |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"__groupkey_</span><span class="si">{}</span><span class="s2">__"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">):</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| <span class="n">index</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| <span class="n">stat_exprs</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_scols</span><span class="p">):</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="n">skipna</span><span class="p">:</span> |
| <span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">asc_nulls_last</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">asc_nulls_first</span><span class="p">()</span> |
| |
| <span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span> |
| <span class="n">order_column</span><span class="p">,</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">name</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">,</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">stat_exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">))</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">stat_exprs</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_handle_output</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.fillna"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.fillna.html#pyspark.pandas.groupby.GroupBy.fillna">[docs]</a> <span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">method</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Fill NA/NaN values in group.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> value : scalar, dict, Series</span> |
| <span class="sd"> Value to use to fill holes. alternately a dict/Series of values</span> |
| <span class="sd"> specifying which value to use for each column.</span> |
| <span class="sd"> DataFrame is not supported.</span> |
| <span class="sd"> method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None</span> |
| <span class="sd"> Method to use for filling holes in reindexed Series pad / ffill: propagate last valid</span> |
| <span class="sd"> observation forward to next valid backfill / bfill:</span> |
| <span class="sd"> use NEXT valid observation to fill gap</span> |
| |
| <span class="sd"> .. deprecated:: 4.0.0</span> |
| |
| <span class="sd"> axis : {0 or `index`}</span> |
| <span class="sd"> 1 and `columns` are not supported.</span> |
| |
| <span class="sd"> .. deprecated:: 4.0.0</span> |
| <span class="sd"> For axis=1, operate on the underlying object instead.</span> |
| <span class="sd"> Otherwise the axis keyword is not necessary.</span> |
| |
| <span class="sd"> inplace : boolean, default False</span> |
| <span class="sd"> Fill in place (do not create a new object)</span> |
| <span class="sd"> limit : int, default None</span> |
| <span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span> |
| <span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span> |
| <span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span> |
| <span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span> |
| <span class="sd"> Must be greater than 0 if not None</span> |
| |
| <span class="sd"> .. deprecated:: 4.0.0</span> |
| |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> DataFrame with NA entries filled.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'A': [1, 1, 2, 2],</span> |
| <span class="sd"> ... 'B': [2, 4, None, 3],</span> |
| <span class="sd"> ... 'C': [None, None, None, 1],</span> |
| <span class="sd"> ... 'D': [0, 1, 5, 4]</span> |
| <span class="sd"> ... },</span> |
| <span class="sd"> ... columns=['A', 'B', 'C', 'D'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 1 2.0 NaN 0</span> |
| <span class="sd"> 1 1 4.0 NaN 1</span> |
| <span class="sd"> 2 2 NaN NaN 5</span> |
| <span class="sd"> 3 2 3.0 1.0 4</span> |
| |
| <span class="sd"> We can also propagate non-null values forward or backward in group.</span> |
| |
| <span class="sd"> >>> df.groupby(['A'])['B'].fillna(method='ffill').sort_index()</span> |
| <span class="sd"> 0 2.0</span> |
| <span class="sd"> 1 4.0</span> |
| <span class="sd"> 2 NaN</span> |
| <span class="sd"> 3 3.0</span> |
| <span class="sd"> Name: B, dtype: float64</span> |
| |
| <span class="sd"> >>> df.groupby(['A']).fillna(method='bfill').sort_index()</span> |
| <span class="sd"> B C D</span> |
| <span class="sd"> 0 2.0 NaN 0</span> |
| <span class="sd"> 1 4.0 NaN 1</span> |
| <span class="sd"> 2 3.0 1.0 5</span> |
| <span class="sd"> 3 3.0 1.0 4</span> |
| <span class="sd"> """</span> |
| <span class="n">should_resolve</span> <span class="o">=</span> <span class="n">method</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="k">if</span> <span class="n">should_resolve</span><span class="p">:</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"DataFrameGroupBy.fillna with 'method' is deprecated "</span> |
| <span class="s2">"and will raise in a future version. "</span> |
| <span class="s2">"Use DataFrameGroupBy.ffill() or DataFrameGroupBy.bfill() instead."</span><span class="p">,</span> |
| <span class="ne">FutureWarning</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_fillna</span><span class="p">(</span> |
| <span class="n">value</span><span class="o">=</span><span class="n">value</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="n">method</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span> |
| <span class="p">),</span> |
| <span class="n">should_resolve</span><span class="o">=</span><span class="n">should_resolve</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.bfill"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.bfill.html#pyspark.pandas.groupby.GroupBy.bfill">[docs]</a> <span class="k">def</span> <span class="nf">bfill</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Synonym for `DataFrame.fillna()` with ``method=`bfill```.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {0 or `index`}</span> |
| <span class="sd"> 1 and `columns` are not supported.</span> |
| <span class="sd"> inplace : boolean, default False</span> |
| <span class="sd"> Fill in place (do not create a new object)</span> |
| <span class="sd"> limit : int, default None</span> |
| <span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span> |
| <span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span> |
| <span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span> |
| <span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span> |
| <span class="sd"> Must be greater than 0 if not None</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> DataFrame with NA entries filled.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'A': [1, 1, 2, 2],</span> |
| <span class="sd"> ... 'B': [2, 4, None, 3],</span> |
| <span class="sd"> ... 'C': [None, None, None, 1],</span> |
| <span class="sd"> ... 'D': [0, 1, 5, 4]</span> |
| <span class="sd"> ... },</span> |
| <span class="sd"> ... columns=['A', 'B', 'C', 'D'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 1 2.0 NaN 0</span> |
| <span class="sd"> 1 1 4.0 NaN 1</span> |
| <span class="sd"> 2 2 NaN NaN 5</span> |
| <span class="sd"> 3 2 3.0 1.0 4</span> |
| |
| <span class="sd"> Propagate non-null values backward.</span> |
| |
| <span class="sd"> >>> df.groupby(['A']).bfill().sort_index()</span> |
| <span class="sd"> B C D</span> |
| <span class="sd"> 0 2.0 NaN 0</span> |
| <span class="sd"> 1 4.0 NaN 1</span> |
| <span class="sd"> 2 3.0 1.0 5</span> |
| <span class="sd"> 3 3.0 1.0 4</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">"bfill"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.ffill"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.ffill.html#pyspark.pandas.groupby.GroupBy.ffill">[docs]</a> <span class="k">def</span> <span class="nf">ffill</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Synonym for `DataFrame.fillna()` with ``method=`ffill```.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> axis : {0 or `index`}</span> |
| <span class="sd"> 1 and `columns` are not supported.</span> |
| <span class="sd"> inplace : boolean, default False</span> |
| <span class="sd"> Fill in place (do not create a new object)</span> |
| <span class="sd"> limit : int, default None</span> |
| <span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span> |
| <span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span> |
| <span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span> |
| <span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span> |
| <span class="sd"> Must be greater than 0 if not None</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> DataFrame with NA entries filled.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'A': [1, 1, 2, 2],</span> |
| <span class="sd"> ... 'B': [2, 4, None, 3],</span> |
| <span class="sd"> ... 'C': [None, None, None, 1],</span> |
| <span class="sd"> ... 'D': [0, 1, 5, 4]</span> |
| <span class="sd"> ... },</span> |
| <span class="sd"> ... columns=['A', 'B', 'C', 'D'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 1 2.0 NaN 0</span> |
| <span class="sd"> 1 1 4.0 NaN 1</span> |
| <span class="sd"> 2 2 NaN NaN 5</span> |
| <span class="sd"> 3 2 3.0 1.0 4</span> |
| |
| <span class="sd"> Propagate non-null values forward.</span> |
| |
| <span class="sd"> >>> df.groupby(['A']).ffill().sort_index()</span> |
| <span class="sd"> B C D</span> |
| <span class="sd"> 0 2.0 NaN 0</span> |
| <span class="sd"> 1 4.0 NaN 1</span> |
| <span class="sd"> 2 NaN NaN 5</span> |
| <span class="sd"> 3 3.0 1.0 4</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">"ffill"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_limit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">asc</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Private function for tail and head.</span> |
| <span class="sd"> """</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> |
| |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span> |
| <span class="p">]</span> |
| |
| <span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> |
| <span class="n">agg_columns</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| |
| <span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_scols</span><span class="p">)</span> |
| <span class="c1"># This part is handled differently depending on whether it is a tail or a head.</span> |
| <span class="n">ordered_window</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">window</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span><span class="o">.</span><span class="n">asc</span><span class="p">())</span> |
| <span class="k">if</span> <span class="n">asc</span> |
| <span class="k">else</span> <span class="n">window</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span><span class="o">.</span><span class="n">desc</span><span class="p">())</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">n</span> <span class="o">>=</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">tmp_row_num_col</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__row_number__"</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">tmp_row_num_col</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">ordered_window</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">tmp_row_num_col</span><span class="p">)</span> <span class="o"><=</span> <span class="n">n</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">tmp_row_num_col</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># Pandas supports Groupby positional indexing since v1.4.0</span> |
| <span class="c1"># https://pandas.pydata.org/docs/whatsnew/v1.4.0.html#groupby-positional-indexing</span> |
| <span class="c1">#</span> |
| <span class="c1"># To support groupby positional indexing, we need add a `__tmp_lag__` column to help</span> |
| <span class="c1"># us filtering rows before the specified offset row.</span> |
| <span class="c1">#</span> |
| <span class="c1"># For example for the dataframe:</span> |
| <span class="c1"># >>> df = ps.DataFrame([["g", "g0"],</span> |
| <span class="c1"># ... ["g", "g1"],</span> |
| <span class="c1"># ... ["g", "g2"],</span> |
| <span class="c1"># ... ["g", "g3"],</span> |
| <span class="c1"># ... ["h", "h0"],</span> |
| <span class="c1"># ... ["h", "h1"]], columns=["A", "B"])</span> |
| <span class="c1"># >>> df.groupby("A").head(-1)</span> |
| <span class="c1">#</span> |
| <span class="c1"># Below is a result to show the `__tmp_lag__` column for above df, the limit n is</span> |
| <span class="c1"># `-1`, the `__tmp_lag__` will be set to `0` in rows[:-1], and left will be set to</span> |
| <span class="c1"># `null`:</span> |
| <span class="c1">#</span> |
| <span class="c1"># >>> sdf.withColumn(tmp_lag_col, F.lag(F.lit(0), -1).over(ordered_window))</span> |
| <span class="c1"># +-----------------+--------------+---+---+-----------------+-----------+</span> |
| <span class="c1"># |__index_level_0__|__groupkey_0__| A| B|__natural_order__|__tmp_lag__|</span> |
| <span class="c1"># +-----------------+--------------+---+---+-----------------+-----------+</span> |
| <span class="c1"># | 0| g| g| g0| 0| 0|</span> |
| <span class="c1"># | 1| g| g| g1| 8589934592| 0|</span> |
| <span class="c1"># | 2| g| g| g2| 17179869184| 0|</span> |
| <span class="c1"># | 3| g| g| g3| 25769803776| null|</span> |
| <span class="c1"># | 4| h| h| h0| 34359738368| 0|</span> |
| <span class="c1"># | 5| h| h| h1| 42949672960| null|</span> |
| <span class="c1"># +-----------------+--------------+---+---+-----------------+-----------+</span> |
| <span class="c1">#</span> |
| <span class="n">tmp_lag_col</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__tmp_lag__"</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">tmp_lag_col</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">lag</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="n">n</span><span class="p">)</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">ordered_window</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="o">~</span><span class="n">F</span><span class="o">.</span><span class="n">isnull</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">tmp_lag_col</span><span class="p">)))</span> |
| <span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">tmp_lag_col</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_handle_output</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">))</span> |
| |
| <div class="viewcode-block" id="GroupBy.head"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.head.html#pyspark.pandas.groupby.GroupBy.head">[docs]</a> <span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return first n rows of each group.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 1, 1, 2, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'b': [2, 3, 1, 4, 6, 9, 8, 10, 7, 5],</span> |
| <span class="sd"> ... 'c': [3, 5, 2, 5, 1, 2, 6, 4, 3, 6]},</span> |
| <span class="sd"> ... columns=['a', 'b', 'c'],</span> |
| <span class="sd"> ... index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 7 1 2 3</span> |
| <span class="sd"> 2 1 3 5</span> |
| <span class="sd"> 4 1 1 2</span> |
| <span class="sd"> 1 1 4 5</span> |
| <span class="sd"> 3 2 6 1</span> |
| <span class="sd"> 4 2 9 2</span> |
| <span class="sd"> 9 2 8 6</span> |
| <span class="sd"> 10 3 10 4</span> |
| <span class="sd"> 5 3 7 3</span> |
| <span class="sd"> 6 3 5 6</span> |
| |
| <span class="sd"> >>> df.groupby('a').head(2).sort_index()</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 2 1 3 5</span> |
| <span class="sd"> 3 2 6 1</span> |
| <span class="sd"> 4 2 9 2</span> |
| <span class="sd"> 5 3 7 3</span> |
| <span class="sd"> 7 1 2 3</span> |
| <span class="sd"> 10 3 10 4</span> |
| |
| <span class="sd"> >>> df.groupby('a')['b'].head(2).sort_index()</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 3 6</span> |
| <span class="sd"> 4 9</span> |
| <span class="sd"> 5 7</span> |
| <span class="sd"> 7 2</span> |
| <span class="sd"> 10 10</span> |
| <span class="sd"> Name: b, dtype: int64</span> |
| |
| <span class="sd"> Supports Groupby positional indexing Since pandas on Spark 3.4 (with pandas 1.4+):</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame([["g", "g0"],</span> |
| <span class="sd"> ... ["g", "g1"],</span> |
| <span class="sd"> ... ["g", "g2"],</span> |
| <span class="sd"> ... ["g", "g3"],</span> |
| <span class="sd"> ... ["h", "h0"],</span> |
| <span class="sd"> ... ["h", "h1"]], columns=["A", "B"])</span> |
| <span class="sd"> >>> df.groupby("A").head(-1) # doctest: +SKIP</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 g g0</span> |
| <span class="sd"> 1 g g1</span> |
| <span class="sd"> 2 g g2</span> |
| <span class="sd"> 4 h h0</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_limit</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">asc</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.tail"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.tail.html#pyspark.pandas.groupby.GroupBy.tail">[docs]</a> <span class="k">def</span> <span class="nf">tail</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return last n rows of each group.</span> |
| |
| <span class="sd"> Similar to `.apply(lambda x: x.tail(n))`, but it returns a subset of rows from</span> |
| <span class="sd"> the original DataFrame with original index and order preserved (`as_index` flag is ignored).</span> |
| |
| <span class="sd"> Does not work for negative values of n.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 1, 1, 2, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'b': [2, 3, 1, 4, 6, 9, 8, 10, 7, 5],</span> |
| <span class="sd"> ... 'c': [3, 5, 2, 5, 1, 2, 6, 4, 3, 6]},</span> |
| <span class="sd"> ... columns=['a', 'b', 'c'],</span> |
| <span class="sd"> ... index=[7, 2, 3, 1, 3, 4, 9, 10, 5, 6])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 7 1 2 3</span> |
| <span class="sd"> 2 1 3 5</span> |
| <span class="sd"> 3 1 1 2</span> |
| <span class="sd"> 1 1 4 5</span> |
| <span class="sd"> 3 2 6 1</span> |
| <span class="sd"> 4 2 9 2</span> |
| <span class="sd"> 9 2 8 6</span> |
| <span class="sd"> 10 3 10 4</span> |
| <span class="sd"> 5 3 7 3</span> |
| <span class="sd"> 6 3 5 6</span> |
| |
| <span class="sd"> >>> df.groupby('a').tail(2).sort_index()</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 1 1 4 5</span> |
| <span class="sd"> 3 1 1 2</span> |
| <span class="sd"> 4 2 9 2</span> |
| <span class="sd"> 5 3 7 3</span> |
| <span class="sd"> 6 3 5 6</span> |
| <span class="sd"> 9 2 8 6</span> |
| |
| <span class="sd"> >>> df.groupby('a')['b'].tail(2).sort_index()</span> |
| <span class="sd"> 1 4</span> |
| <span class="sd"> 3 1</span> |
| <span class="sd"> 4 9</span> |
| <span class="sd"> 5 7</span> |
| <span class="sd"> 6 5</span> |
| <span class="sd"> 9 8</span> |
| <span class="sd"> Name: b, dtype: int64</span> |
| |
| <span class="sd"> Supports Groupby positional indexing Since pandas on Spark 3.4 (with pandas 1.4+):</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame([["g", "g0"],</span> |
| <span class="sd"> ... ["g", "g1"],</span> |
| <span class="sd"> ... ["g", "g2"],</span> |
| <span class="sd"> ... ["g", "g3"],</span> |
| <span class="sd"> ... ["h", "h0"],</span> |
| <span class="sd"> ... ["h", "h1"]], columns=["A", "B"])</span> |
| <span class="sd"> >>> df.groupby("A").tail(-1) # doctest: +SKIP</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 3 g g3</span> |
| <span class="sd"> 2 g g2</span> |
| <span class="sd"> 1 g g1</span> |
| <span class="sd"> 5 h h1</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_limit</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">asc</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.shift"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.shift.html#pyspark.pandas.groupby.GroupBy.shift">[docs]</a> <span class="k">def</span> <span class="nf">shift</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Shift each group by periods observations.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> periods : integer, default 1</span> |
| <span class="sd"> number of periods to shift</span> |
| <span class="sd"> fill_value : optional</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| <span class="sd"> Object shifted within each group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 1</span> |
| <span class="sd"> 1 1 2</span> |
| <span class="sd"> 2 1 2</span> |
| <span class="sd"> 3 2 2</span> |
| <span class="sd"> 4 2 3</span> |
| <span class="sd"> 5 2 3</span> |
| <span class="sd"> 6 3 3</span> |
| <span class="sd"> 7 3 4</span> |
| <span class="sd"> 8 3 4</span> |
| |
| <span class="sd"> >>> df.groupby('a').shift().sort_index() # doctest: +SKIP</span> |
| <span class="sd"> b</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 1.0</span> |
| <span class="sd"> 2 2.0</span> |
| <span class="sd"> 3 NaN</span> |
| <span class="sd"> 4 2.0</span> |
| <span class="sd"> 5 3.0</span> |
| <span class="sd"> 6 NaN</span> |
| <span class="sd"> 7 3.0</span> |
| <span class="sd"> 8 4.0</span> |
| |
| <span class="sd"> >>> df.groupby('a').shift(periods=-1, fill_value=0).sort_index() # doctest: +SKIP</span> |
| <span class="sd"> b</span> |
| <span class="sd"> 0 2</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 0</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> 4 3</span> |
| <span class="sd"> 5 0</span> |
| <span class="sd"> 6 4</span> |
| <span class="sd"> 7 4</span> |
| <span class="sd"> 8 0</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_shift</span><span class="p">(</span><span class="n">periods</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> |
| <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.transform"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.transform.html#pyspark.pandas.groupby.GroupBy.transform">[docs]</a> <span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Apply function column-by-column to the GroupBy object.</span> |
| |
| <span class="sd"> The function passed to `transform` must take a Series as its first</span> |
| <span class="sd"> argument and return a Series. The given function is executed for</span> |
| <span class="sd"> each series in each grouped data.</span> |
| |
| <span class="sd"> While `transform` is a very flexible method, its downside is that</span> |
| <span class="sd"> using it can be quite a bit slower than using more specific methods</span> |
| <span class="sd"> like `agg` or `transform`. pandas-on-Spark offers a wide range of method that will</span> |
| <span class="sd"> be much faster than using `transform` for their specific purposes, so try to</span> |
| <span class="sd"> use them before reaching for `transform`.</span> |
| |
| <span class="sd"> .. note:: this API executes the function once to infer the type which is</span> |
| <span class="sd"> potentially expensive, for instance, when the dataset is created after</span> |
| <span class="sd"> aggregations or sorting.</span> |
| |
| <span class="sd"> To avoid this, specify return type in ``func``, for instance, as below:</span> |
| |
| <span class="sd"> >>> def convert_to_string(x) -> ps.Series[str]:</span> |
| <span class="sd"> ... return x.apply("a string {}".format)</span> |
| |
| <span class="sd"> When the given function has the return type annotated, the original index of the</span> |
| <span class="sd"> GroupBy object will be lost, and a default index will be attached to the result.</span> |
| <span class="sd"> Please be careful about configuring the default index. See also `Default Index Type</span> |
| <span class="sd"> <https://spark.apache.org/docs/latest/api/python/user_guide/pandas_on_spark/options.html#default-index-type>`_.</span> |
| |
| <span class="sd"> .. note:: the series within ``func`` is actually a pandas series. Therefore,</span> |
| <span class="sd"> any pandas API within this function is allowed.</span> |
| |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : callable</span> |
| <span class="sd"> A callable that takes a Series as its first argument, and</span> |
| <span class="sd"> returns a Series.</span> |
| <span class="sd"> *args</span> |
| <span class="sd"> Positional arguments to pass to func.</span> |
| <span class="sd"> **kwargs</span> |
| <span class="sd"> Keyword arguments to pass to func.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> applied : DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> aggregate : Apply aggregate function to the GroupBy object.</span> |
| <span class="sd"> Series.apply : Apply a function to a Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'A': [0, 0, 1],</span> |
| <span class="sd"> ... 'B': [1, 2, 3],</span> |
| <span class="sd"> ... 'C': [4, 6, 5]}, columns=['A', 'B', 'C'])</span> |
| |
| <span class="sd"> >>> g = df.groupby('A')</span> |
| |
| <span class="sd"> Notice that ``g`` has two groups, ``0`` and ``1``.</span> |
| <span class="sd"> Calling `transform` in various ways, we can get different grouping results:</span> |
| <span class="sd"> Below the functions passed to `transform` takes a Series as</span> |
| <span class="sd"> its argument and returns a Series. `transform` applies the function on each series</span> |
| <span class="sd"> in each grouped data, and combine them into a new DataFrame:</span> |
| |
| <span class="sd"> >>> def convert_to_string(x) -> ps.Series[str]:</span> |
| <span class="sd"> ... return x.apply("a string {}".format)</span> |
| <span class="sd"> >>> g.transform(convert_to_string) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 a string 1 a string 4</span> |
| <span class="sd"> 1 a string 2 a string 6</span> |
| <span class="sd"> 2 a string 3 a string 5</span> |
| |
| <span class="sd"> >>> def plus_max(x) -> ps.Series[int]:</span> |
| <span class="sd"> ... return x + x.max()</span> |
| <span class="sd"> >>> g.transform(plus_max) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 3 10</span> |
| <span class="sd"> 1 4 12</span> |
| <span class="sd"> 2 6 10</span> |
| |
| <span class="sd"> You can omit the type hint and let pandas-on-Spark infer its type.</span> |
| |
| <span class="sd"> >>> def plus_min(x):</span> |
| <span class="sd"> ... return x + x.min()</span> |
| <span class="sd"> >>> g.transform(plus_min) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 2 8</span> |
| <span class="sd"> 1 3 10</span> |
| <span class="sd"> 2 6 10</span> |
| |
| <span class="sd"> In case of Series, it works as below.</span> |
| |
| <span class="sd"> >>> df.B.groupby(df.A).transform(plus_max)</span> |
| <span class="sd"> 0 3</span> |
| <span class="sd"> 1 4</span> |
| <span class="sd"> 2 6</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| |
| <span class="sd"> >>> (df * -1).B.groupby(df.A).transform(abs)</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| |
| <span class="sd"> You can also specify extra arguments to pass to the function.</span> |
| |
| <span class="sd"> >>> def calculation(x, y, z) -> ps.Series[int]:</span> |
| <span class="sd"> ... return x + x.min() + y + z</span> |
| <span class="sd"> >>> g.transform(calculation, 5, z=20) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 27 33</span> |
| <span class="sd"> 1 28 35</span> |
| <span class="sd"> 2 31 35</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">callable</span><span class="p">(</span><span class="n">func</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"</span><span class="si">%s</span><span class="s2"> object is not callable"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| |
| <span class="n">spec</span> <span class="o">=</span> <span class="n">inspect</span><span class="o">.</span><span class="n">getfullargspec</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> |
| <span class="n">return_sig</span> <span class="o">=</span> <span class="n">spec</span><span class="o">.</span><span class="n">annotations</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"return"</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> |
| |
| <span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">agg_columns</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">pandas_transform</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="n">should_infer_schema</span> <span class="o">=</span> <span class="n">return_sig</span> <span class="ow">is</span> <span class="kc">None</span> |
| |
| <span class="k">if</span> <span class="n">should_infer_schema</span><span class="p">:</span> |
| <span class="c1"># Here we execute with the first 1000 to get the return type.</span> |
| <span class="c1"># If the records were less than 1000, it uses pandas API directly for a shortcut.</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"If the type hints is not specified for `groupby.transform`, "</span> |
| <span class="s2">"it is expensive to infer the data type internally."</span> |
| <span class="p">)</span> |
| <span class="n">limit</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"compute.shortcut_limit"</span><span class="p">)</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">limit</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="n">psdf_from_pandas</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">force_decimal_precision_scale</span><span class="p">(</span> |
| <span class="n">as_nullable_spark_type</span><span class="p">(</span> |
| <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span><span class="o">.</span><span class="n">schema</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o"><=</span> <span class="n">limit</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_handle_output</span><span class="p">(</span><span class="n">psdf_from_pandas</span><span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">pandas_transform</span><span class="p">,</span> |
| <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span> |
| <span class="n">return_schema</span><span class="p">,</span> |
| <span class="n">retain_index</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="c1"># If schema is inferred, we can restore indexes too.</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span> |
| <span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span> |
| <span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">return_type</span> <span class="o">=</span> <span class="n">infer_return_type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">return_type</span><span class="p">,</span> <span class="n">SeriesType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Expected the return type of this function to be of Series type, "</span> |
| <span class="s2">"but found type </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">return_type</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">dtype</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">dtype</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">spark_type</span> |
| |
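| <span class="c1"># The annotated return type applies to every non-groupkey output column.</span> |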
| <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">InternalField</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">struct_field</span><span class="o">=</span><span class="n">StructField</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">c</span><span class="p">,</span> <span class="n">dataType</span><span class="o">=</span><span class="n">spark_type</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span> |
| <span class="k">if</span> <span class="n">c</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">groupkey_names</span> |
| <span class="p">]</span> |
| |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">data_fields</span><span class="p">])</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">pandas_transform</span><span class="p">,</span> |
| <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span> |
| <span class="n">return_schema</span><span class="p">,</span> |
| <span class="n">retain_index</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="c1"># Otherwise, it loses index.</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_handle_output</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.nunique"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.nunique.html#pyspark.pandas.groupby.GroupBy.nunique">[docs]</a> <span class="k">def</span> <span class="nf">nunique</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return DataFrame with number of distinct observations per group for each column.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> dropna : boolean, default True</span> |
| <span class="sd"> Don’t include NaN in the counts.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> nunique : DataFrame or Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',</span> |
| <span class="sd"> ... 'ham', 'ham'],</span> |
| <span class="sd"> ... 'value1': [1, 5, 5, 2, 5, 5],</span> |
| <span class="sd"> ... 'value2': list('abbaxy')}, columns=['id', 'value1', 'value2'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> id value1 value2</span> |
| <span class="sd"> 0 spam 1 a</span> |
| <span class="sd"> 1 egg 5 b</span> |
| <span class="sd"> 2 egg 5 b</span> |
| <span class="sd"> 3 spam 2 a</span> |
| <span class="sd"> 4 ham 5 x</span> |
| <span class="sd"> 5 ham 5 y</span> |
| |
| <span class="sd"> >>> df.groupby('id').nunique().sort_index() # doctest: +SKIP</span> |
| <span class="sd"> value1 value2</span> |
| <span class="sd"> id</span> |
| <span class="sd"> egg 1 1</span> |
| <span class="sd"> ham 1 2</span> |
| <span class="sd"> spam 2 1</span> |
| |
| <span class="sd"> >>> df.groupby('id')['value1'].nunique().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> id</span> |
| <span class="sd"> egg 1</span> |
| <span class="sd"> ham 1</span> |
| <span class="sd"> spam 2</span> |
| <span class="sd"> Name: value1, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">dropna</span><span class="p">:</span> |
| |
| <span class="k">def</span> <span class="nf">stat_function</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">countDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| |
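| <span class="c1"># countDistinct ignores nulls, so add one more when the group contains at least one null.</span> |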
| <span class="k">def</span> <span class="nf">stat_function</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">countDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="o">+</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">))</span> <span class="o">>=</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">stat_function</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">rolling</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">window</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"RollingGroupby[FrameLike]"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return an rolling grouper, providing rolling</span> |
| <span class="sd"> functionality per group.</span> |
| |
| <span class="sd"> .. note:: 'min_periods' in pandas-on-Spark works as a fixed window size unlike pandas.</span> |
| <span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span> |
| <span class="sd"> soon.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> window : int, or offset</span> |
| <span class="sd"> Size of the moving window.</span> |
| <span class="sd"> This is the number of observations used for calculating the statistic.</span> |
| <span class="sd"> Each window will be a fixed size.</span> |
| |
| <span class="sd"> min_periods : int, default 1</span> |
| <span class="sd"> Minimum number of observations in window required to have a value</span> |
| <span class="sd"> (otherwise result is NA).</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.groupby</span> |
| <span class="sd"> DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">RollingGroupby</span> |
| |
| <span class="k">return</span> <span class="n">RollingGroupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">window</span><span class="p">,</span> <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">expanding</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"ExpandingGroupby[FrameLike]"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return an expanding grouper, providing expanding</span> |
| <span class="sd"> functionality per group.</span> |
| |
| <span class="sd"> .. note:: 'min_periods' in pandas-on-Spark works as a fixed window size unlike pandas.</span> |
| <span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span> |
| <span class="sd"> soon.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> min_periods : int, default 1</span> |
| <span class="sd"> Minimum number of observations in window required to have a value</span> |
| <span class="sd"> (otherwise result is NA).</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.groupby</span> |
| <span class="sd"> DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">ExpandingGroupby</span> |
| |
| <span class="k">return</span> <span class="n">ExpandingGroupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">)</span> |
| |
| <span class="c1"># TODO: 'adjust', 'axis', 'method' parameter should be implemented.</span> |
| <div class="viewcode-block" id="GroupBy.ewm"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.ewm.html#pyspark.pandas.groupby.GroupBy.ewm">[docs]</a> <span class="k">def</span> <span class="nf">ewm</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">com</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">span</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">halflife</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">alpha</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">min_periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">ignore_na</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"ExponentialMovingGroupby[FrameLike]"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return an ewm grouper, providing ewm functionality per group.</span> |
| |
| <span class="sd"> .. note:: 'min_periods' in pandas-on-Spark works as a fixed window size unlike pandas.</span> |
| <span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span> |
| <span class="sd"> soon.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> com : float, optional</span> |
| <span class="sd"> Specify decay in terms of center of mass.</span> |
| <span class="sd"> alpha = 1 / (1 + com), for com >= 0.</span> |
| |
| <span class="sd"> span : float, optional</span> |
| <span class="sd"> Specify decay in terms of span.</span> |
| <span class="sd"> alpha = 2 / (span + 1), for span >= 1.</span> |
| |
| <span class="sd"> halflife : float, optional</span> |
| <span class="sd"> Specify decay in terms of half-life.</span> |
| <span class="sd"> alpha = 1 - exp(-ln(2) / halflife), for halflife > 0.</span> |
| |
| <span class="sd"> alpha : float, optional</span> |
| <span class="sd"> Specify smoothing factor alpha directly.</span> |
| <span class="sd"> 0 < alpha <= 1.</span> |
| |
| <span class="sd"> min_periods : int, default None</span> |
| <span class="sd"> Minimum number of observations in window required to have a value</span> |
| <span class="sd"> (otherwise result is NA).</span> |
| |
| <span class="sd"> ignore_na : bool, default False</span> |
| <span class="sd"> Ignore missing values when calculating weights.</span> |
| |
| <span class="sd"> - When ``ignore_na=False`` (default), weights are based on absolute positions.</span> |
| <span class="sd"> For example, the weights of :math:`x_0` and :math:`x_2` used in calculating</span> |
| <span class="sd"> the final weighted average of [:math:`x_0`, None, :math:`x_2`] are</span> |
| <span class="sd"> :math:`(1-\alpha)^2` and :math:`1` if ``adjust=True``, and</span> |
| <span class="sd"> :math:`(1-\alpha)^2` and :math:`\alpha` if ``adjust=False``.</span> |
| |
| <span class="sd"> - When ``ignore_na=True``, weights are based</span> |
| <span class="sd"> on relative positions. For example, the weights of :math:`x_0` and :math:`x_2`</span> |
| <span class="sd"> used in calculating the final weighted average of</span> |
| <span class="sd"> [:math:`x_0`, None, :math:`x_2`] are :math:`1-\alpha` and :math:`1` if</span> |
| <span class="sd"> ``adjust=True``, and :math:`1-\alpha` and :math:`\alpha` if ``adjust=False``.</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">ExponentialMovingGroupby</span> |
| |
| <span class="k">return</span> <span class="n">ExponentialMovingGroupby</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">com</span><span class="o">=</span><span class="n">com</span><span class="p">,</span> |
| <span class="n">span</span><span class="o">=</span><span class="n">span</span><span class="p">,</span> |
| <span class="n">halflife</span><span class="o">=</span><span class="n">halflife</span><span class="p">,</span> |
| <span class="n">alpha</span><span class="o">=</span><span class="n">alpha</span><span class="p">,</span> |
| <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">,</span> |
| <span class="n">ignore_na</span><span class="o">=</span><span class="n">ignore_na</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.get_group"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.get_group.html#pyspark.pandas.groupby.GroupBy.get_group">[docs]</a> <span class="k">def</span> <span class="nf">get_group</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]])</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Construct DataFrame from group with provided name.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> name : object</span> |
| <span class="sd"> The name of the group to get as a DataFrame.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> group : same type as obj</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psdf = ps.DataFrame([('falcon', 'bird', 389.0),</span> |
| <span class="sd"> ... ('parrot', 'bird', 24.0),</span> |
| <span class="sd"> ... ('lion', 'mammal', 80.5),</span> |
| <span class="sd"> ... ('monkey', 'mammal', np.nan)],</span> |
| <span class="sd"> ... columns=['name', 'class', 'max_speed'],</span> |
| <span class="sd"> ... index=[0, 2, 3, 1])</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> name class max_speed</span> |
| <span class="sd"> 0 falcon bird 389.0</span> |
| <span class="sd"> 2 parrot bird 24.0</span> |
| <span class="sd"> 3 lion mammal 80.5</span> |
| <span class="sd"> 1 monkey mammal NaN</span> |
| |
| <span class="sd"> >>> psdf.groupby("class").get_group("bird").sort_index()</span> |
| <span class="sd"> name class max_speed</span> |
| <span class="sd"> 0 falcon bird 389.0</span> |
| <span class="sd"> 2 parrot bird 24.0</span> |
| |
| <span class="sd"> >>> psdf.groupby("class").get_group("mammal").sort_index()</span> |
| <span class="sd"> name class max_speed</span> |
| <span class="sd"> 1 monkey mammal NaN</span> |
| <span class="sd"> 3 lion mammal 80.5</span> |
| <span class="sd"> """</span> |
| <span class="n">groupkeys</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_hashable</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"unhashable type: '</span><span class="si">{}</span><span class="s2">'"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">name</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"must supply a tuple to get_group with multiple grouping keys"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"must supply a same-length tuple to get_group with multiple grouping keys"</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="p">[</span><span class="n">name</span><span class="p">]</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">groupkey</span><span class="p">,</span> <span class="n">item</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">groupkey</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">cond</span> <span class="o">&</span> <span class="p">(</span><span class="n">scol</span> <span class="o">==</span> <span class="n">item</span><span class="p">)</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span> |
| <span class="n">spark_frame</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_columns</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_scols</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">spark_frame</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">spark_frame</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">spark_frame</span><span class="p">,</span> <span class="n">s</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">head</span><span class="p">()</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_handle_output</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.median"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.median.html#pyspark.pandas.groupby.GroupBy.median">[docs]</a> <span class="k">def</span> <span class="nf">median</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10000</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute median of groups, excluding missing values.</span> |
| |
| <span class="sd"> For multiple groupings, the result index will be a MultiIndex</span> |
| |
| <span class="sd"> .. note:: Unlike pandas', the median in pandas-on-Spark is an approximated median based upon</span> |
| <span class="sd"> approximate percentile computation because computing median across a large dataset</span> |
| <span class="sd"> is extremely expensive.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numeric_only : bool, default False</span> |
| <span class="sd"> Include only float, int, boolean columns.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| <span class="sd"> Median of values within each group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psdf = ps.DataFrame({'a': [1., 1., 1., 1., 2., 2., 2., 3., 3., 3.],</span> |
| <span class="sd"> ... 'b': [2., 3., 1., 4., 6., 9., 8., 10., 7., 5.],</span> |
| <span class="sd"> ... 'c': [3., 5., 2., 5., 1., 2., 6., 4., 3., 6.]},</span> |
| <span class="sd"> ... columns=['a', 'b', 'c'],</span> |
| <span class="sd"> ... index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 7 1.0 2.0 3.0</span> |
| <span class="sd"> 2 1.0 3.0 5.0</span> |
| <span class="sd"> 4 1.0 1.0 2.0</span> |
| <span class="sd"> 1 1.0 4.0 5.0</span> |
| <span class="sd"> 3 2.0 6.0 1.0</span> |
| <span class="sd"> 4 2.0 9.0 2.0</span> |
| <span class="sd"> 9 2.0 8.0 6.0</span> |
| <span class="sd"> 10 3.0 10.0 4.0</span> |
| <span class="sd"> 5 3.0 7.0 3.0</span> |
| <span class="sd"> 6 3.0 5.0 6.0</span> |
| |
| <span class="sd"> DataFrameGroupBy</span> |
| |
| <span class="sd"> >>> psdf.groupby('a').median().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b c</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1.0 2.0 3.0</span> |
| <span class="sd"> 2.0 8.0 2.0</span> |
| <span class="sd"> 3.0 7.0 4.0</span> |
| |
| <span class="sd"> SeriesGroupBy</span> |
| |
| <span class="sd"> >>> psdf.groupby('a')['b'].median().sort_index()</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1.0 2.0</span> |
| <span class="sd"> 2.0 8.0</span> |
| <span class="sd"> 3.0 7.0</span> |
| <span class="sd"> Name: b, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">accuracy</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"accuracy must be an integer; however, got [</span><span class="si">%s</span><span class="s2">]"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span> |
| <span class="p">)</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_validate_agg_columns</span><span class="p">(</span><span class="n">numeric_only</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">,</span> <span class="n">function_name</span><span class="o">=</span><span class="s2">"median"</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">stat_function</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">percentile_approx</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">stat_function</span><span class="p">,</span> |
| <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,),</span> |
| <span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_validate_agg_columns</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">],</span> <span class="n">function_name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Validate aggregation columns and raise an error or a warning following pandas."""</span> |
| <span class="n">has_non_numeric</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="k">for</span> <span class="n">_agg_col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">_agg_col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)):</span> |
| <span class="n">has_non_numeric</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="k">break</span> |
| <span class="k">if</span> <span class="n">has_non_numeric</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">SeriesGroupBy</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"Only numeric aggregation column is accepted."</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">numeric_only</span> <span class="ow">and</span> <span class="n">has_non_numeric</span><span class="p">:</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"Dropping invalid columns in DataFrameGroupBy.</span><span class="si">%s</span><span class="s2"> is deprecated. "</span> |
| <span class="s2">"In a future version, a TypeError will be raised. "</span> |
| <span class="s2">"Before calling .</span><span class="si">%s</span><span class="s2">, select only columns which should be "</span> |
| <span class="s2">"valid for the function."</span> <span class="o">%</span> <span class="p">(</span><span class="n">function_name</span><span class="p">,</span> <span class="n">function_name</span><span class="p">),</span> |
| <span class="ne">FutureWarning</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">sfun</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> |
| <span class="n">accepted_spark_types</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Type</span><span class="p">[</span><span class="n">DataType</span><span class="p">],</span> <span class="o">...</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">bool_to_numeric</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Apply an aggregate function `sfun` per column and reduce to a FrameLike.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> sfun : The aggregate function to apply per column.</span> |
| <span class="sd"> accepted_spark_types: Accepted spark types of columns to be aggregated;</span> |
| <span class="sd"> default None means all spark types are accepted.</span> |
| <span class="sd"> bool_to_numeric: If True, boolean columns are converted to numeric columns, which</span> |
| <span class="sd"> are accepted for all statistical functions regardless of</span> |
| <span class="sd"> `accepted_spark_types`.</span> |
| <span class="sd"> """</span> |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span> |
| <span class="n">internal</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_reduce</span><span class="p">(</span> |
| <span class="n">groupkey_names</span><span class="p">,</span> <span class="n">accepted_spark_types</span><span class="p">,</span> <span class="n">bool_to_numeric</span> |
| <span class="p">)</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">min_count</span> <span class="o">=</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"min_count"</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span> |
| <span class="n">stat_exprs</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">input_scol</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">nan_to_null</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="k">if</span> <span class="n">sfun</span><span class="o">.</span><span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"sum"</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> <span class="n">StringType</span> |
| <span class="p">):</span> |
| <span class="n">input_scol_name</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="c1"># Sort data with natural order column to ensure order of data</span> |
| <span class="n">sorted_array</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">array_sort</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">collect_list</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> <span class="n">input_scol</span><span class="p">))</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># Using transform to extract strings</span> |
| <span class="n">output_scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">concat_ws</span><span class="p">(</span> |
| <span class="s2">""</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">sorted_array</span><span class="p">,</span> <span class="k">lambda</span> <span class="n">x</span><span class="p">:</span> <span class="n">x</span><span class="o">.</span><span class="n">getField</span><span class="p">(</span><span class="n">input_scol_name</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">output_scol</span> <span class="o">=</span> <span class="n">sfun</span><span class="p">(</span><span class="n">input_scol</span><span class="p">)</span> |
| |
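| <span class="c1"># min_count semantics: if a group has fewer than min_count non-null values,</span> |
| <span class="c1"># F.when without an otherwise leaves the group's result as NULL, matching pandas.</span> |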
| <span class="k">if</span> <span class="n">min_count</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">output_scol</span> <span class="o">=</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="o">~</span><span class="n">F</span><span class="o">.</span><span class="n">isnull</span><span class="p">(</span><span class="n">input_scol</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">)))</span> <span class="o">>=</span> <span class="n">min_count</span><span class="p">,</span> <span class="n">output_scol</span> |
| <span class="p">)</span> |
| |
| <span class="n">stat_exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">output_scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]))</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">stat_exprs</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_return</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_prepare_return</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">agg_column_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span> |
| <span class="n">subset</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">:</span> |
| <span class="n">column_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">column</span><span class="o">.</span><span class="n">name</span> <span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">]</span> |
| <span class="k">for</span> <span class="n">groupkey</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">groupkey</span><span class="o">.</span><span class="n">name</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_names</span><span class="p">:</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"A grouping was used that is not in the columns of the DataFrame and so "</span> |
| <span class="s2">"was excluded from the result. "</span> |
| <span class="s2">"This grouping will be included in a future version. "</span> |
| <span class="s2">"Add the grouping as a column of the DataFrame to silence this warning."</span><span class="p">,</span> |
| <span class="ne">FutureWarning</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">should_drop_index</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span> |
| <span class="n">i</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">gkey</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> <span class="k">if</span> <span class="n">gkey</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">should_drop_index</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">level</span><span class="o">=</span><span class="n">should_drop_index</span><span class="p">,</span> <span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">should_drop_index</span><span class="p">)</span> <span class="o"><</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">):</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_handle_output</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">agg_column_names</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_prepare_reduce</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">groupkey_names</span><span class="p">:</span> <span class="n">List</span><span class="p">,</span> |
| <span class="n">accepted_spark_types</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Type</span><span class="p">[</span><span class="n">DataType</span><span class="p">],</span> <span class="o">...</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">bool_to_numeric</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">InternalFrame</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> <span class="n">SparkDataFrame</span><span class="p">]:</span> |
| <span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">bool_to_numeric</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">):</span> |
| <span class="n">agg_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="nb">int</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="p">(</span><span class="n">accepted_spark_types</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">)</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">accepted_spark_types</span> |
| <span class="p">):</span> |
| <span class="n">agg_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="o">*</span><span class="n">groupkey_scols</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span> |
| <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">internal</span><span class="p">,</span> <span class="n">agg_columns</span><span class="p">,</span> <span class="n">sdf</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_resolve_grouping_from_diff_dataframes</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> <span class="n">Set</span><span class="p">[</span><span class="n">Label</span><span class="p">]]:</span> |
| <span class="n">column_labels_level</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> |
| |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">additional_pssers</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">additional_column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">tmp_column_labels</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">by</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">col_or_s</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="n">psdf</span><span class="p">:</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="o">.</span><span class="n">_column_label</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">psdf</span><span class="p">):</span> |
| <span class="n">temp_label</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="s2">"__tmp_groupkey_</span><span class="si">{}</span><span class="s2">__"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span> |
| <span class="n">additional_pssers</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">temp_label</span><span class="p">))</span> |
| <span class="n">additional_column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">temp_label</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="nb">tuple</span><span class="p">(</span> |
| <span class="p">([</span><span class="s2">""</span><span class="p">]</span> <span class="o">*</span> <span class="p">(</span><span class="n">column_labels_level</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span> <span class="o">+</span> <span class="p">[</span><span class="s2">"__tmp_groupkey_</span><span class="si">{}</span><span class="s2">__"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">)]</span> |
| <span class="p">),</span> |
| <span class="p">)</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span> |
| <span class="n">tmp_column_labels</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">col_or_s</span><span class="p">]</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">))</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">]</span> |
| <span class="o">+</span> <span class="n">additional_pssers</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">assign_columns</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">this_column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> <span class="n">that_column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]]:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span> |
| <span class="s2">"Duplicated labels with groupby() and "</span> |
| <span class="s2">"'compute.ops_on_diff_frames' option is not supported currently "</span> |
| <span class="s2">"Please use unique labels in series and frames."</span> |
| <span class="p">)</span> |
| |
| <span class="k">for</span> <span class="n">col_or_s</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">tmp_column_labels</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">col_or_s</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">align_diff_frames</span><span class="p">(</span> |
| <span class="n">assign_columns</span><span class="p">,</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> |
| <span class="n">fillna</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> |
| <span class="n">how</span><span class="o">=</span><span class="s2">"inner"</span><span class="p">,</span> |
| <span class="n">preserve_order_column</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">tmp_column_labels</span> <span class="o">|=</span> <span class="nb">set</span><span class="p">(</span><span class="n">additional_column_labels</span><span class="p">)</span> |
| |
| <span class="n">new_by_series</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">col_or_s</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">tmp_column_labels</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">col_or_s</span> |
| <span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">name</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| |
| <span class="k">return</span> <span class="n">psdf</span><span class="p">,</span> <span class="n">new_by_series</span><span class="p">,</span> <span class="n">tmp_column_labels</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_resolve_grouping</span><span class="p">(</span><span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]])</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">]:</span> |
| <span class="n">new_by_series</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="n">by</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">col_or_s</span><span class="p">]</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">))</span> |
| <span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">new_by_series</span> |
| |
| |
| <span class="k">class</span> <span class="nc">DataFrameGroupBy</span><span class="p">(</span><span class="n">GroupBy</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">]):</span> |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_build</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrameGroupBy"</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">col_or_s</span><span class="p">)</span> <span class="k">for</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="n">by</span><span class="p">):</span> |
| <span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">new_by_series</span><span class="p">,</span> |
| <span class="n">column_labels_to_exclude</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping_from_diff_dataframes</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">by</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">new_by_series</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">by</span><span class="p">)</span> |
| <span class="n">column_labels_to_exclude</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">DataFrameGroupBy</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">new_by_series</span><span class="p">,</span> |
| <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span> |
| <span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="n">column_labels_to_exclude</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> |
| <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="n">column_labels_to_exclude</span><span class="p">:</span> <span class="n">Set</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> |
| <span class="n">agg_columns</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="n">agg_columns_selected</span> <span class="o">=</span> <span class="n">agg_columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="k">if</span> <span class="n">agg_columns_selected</span><span class="p">:</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels_to_exclude</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">label</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">any</span><span class="p">(</span><span class="n">label</span> <span class="o">==</span> <span class="n">key</span><span class="o">.</span><span class="n">_column_label</span> <span class="ow">and</span> <span class="n">key</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="n">psdf</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">by</span><span class="p">)</span> |
| <span class="ow">and</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_labels_to_exclude</span> |
| <span class="p">]</span> |
| |
| <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">=</span><span class="n">psdf</span><span class="p">,</span> |
| <span class="n">groupkeys</span><span class="o">=</span><span class="n">by</span><span class="p">,</span> |
| <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span> |
| <span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="n">column_labels_to_exclude</span><span class="p">,</span> |
| <span class="n">agg_columns_selected</span><span class="o">=</span><span class="n">agg_columns_selected</span><span class="p">,</span> |
| <span class="n">agg_columns</span><span class="o">=</span><span class="p">[</span><span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">MissingPandasLikeDataFrameGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">):</span> |
| <span class="n">property_or_func</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">MissingPandasLikeDataFrameGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="nb">property</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">property_or_func</span><span class="o">.</span><span class="n">fget</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">partial</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="fm">__getitem__</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">GroupBy</span><span class="p">:</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span> <span class="ow">and</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">item</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">SeriesGroupBy</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">item</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">item</span><span class="p">,)),</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">item</span><span class="p">):</span> |
| <span class="n">item</span> <span class="o">=</span> <span class="p">[</span><span class="n">item</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">item</span><span class="p">):</span> |
| <span class="n">item</span> <span class="o">=</span> <span class="p">[(</span><span class="n">item</span><span class="p">,)]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">item</span> <span class="o">=</span> <span class="p">[</span><span class="n">i</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">i</span><span class="p">,)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">item</span><span class="p">]</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">:</span> |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">key</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">item</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"cannot insert </span><span class="si">{}</span><span class="s2">, already exists"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">name</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrameGroupBy</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> |
| <span class="n">as_index</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">,</span> |
| <span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span><span class="p">,</span> |
| <span class="n">agg_columns</span><span class="o">=</span><span class="n">item</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">"SeriesGroupBy"</span><span class="p">],</span> <span class="n">Series</span><span class="p">],</span> |
| <span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="n">applied</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">:</span> |
| <span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">(</span><span class="n">column</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span> |
| <span class="k">if</span> <span class="n">numeric_only</span><span class="p">:</span> |
| <span class="n">applied</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">applied</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)]</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">applied</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">DataError</span><span class="p">(</span><span class="s2">"No numeric types to aggregate"</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span><span class="n">applied</span><span class="p">,</span> <span class="n">keep_order</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">should_resolve</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_handle_output</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">agg_column_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">agg_column_names</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span><span class="p">[</span><span class="n">agg_column_names</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span> |
| |
| <span class="c1"># TODO: Implement 'percentiles', 'include', and 'exclude' arguments.</span> |
| <span class="c1"># TODO: Add ``DataFrame.select_dtypes`` to See Also when 'include'</span> |
| <span class="c1"># and 'exclude' arguments are implemented.</span> |
| <div class="viewcode-block" id="DataFrameGroupBy.describe"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.DataFrameGroupBy.describe.html#pyspark.pandas.groupby.DataFrameGroupBy.describe">[docs]</a> <span class="k">def</span> <span class="nf">describe</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Generate descriptive statistics that summarize the central tendency,</span> |
| <span class="sd"> dispersion and shape of a dataset's distribution, excluding</span> |
| <span class="sd"> ``NaN`` values.</span> |
| |
| <span class="sd"> Analyzes both numeric and object series, as well</span> |
| <span class="sd"> as ``DataFrame`` column sets of mixed data types. The output</span> |
| <span class="sd"> will vary depending on what is provided. Refer to the notes</span> |
| <span class="sd"> below for more detail.</span> |
| |
| <span class="sd"> .. note:: Unlike pandas, the percentiles in pandas-on-Spark are based upon</span> |
| <span class="sd"> approximate percentile computation because computing percentiles</span> |
| <span class="sd"> across a large dataset is extremely expensive.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> Summary statistics of the DataFrame provided.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.count</span> |
| <span class="sd"> DataFrame.max</span> |
| <span class="sd"> DataFrame.min</span> |
| <span class="sd"> DataFrame.mean</span> |
| <span class="sd"> DataFrame.std</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 1 4 7</span> |
| <span class="sd"> 1 1 5 8</span> |
| <span class="sd"> 2 3 6 9</span> |
| |
| <span class="sd"> Describing a ``DataFrame``. By default only numeric fields</span> |
| <span class="sd"> are returned.</span> |
| |
| <span class="sd"> >>> described = df.groupby('a').describe()</span> |
| <span class="sd"> >>> described.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b c</span> |
| <span class="sd"> count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 2.0 4.5 0.707107 4.0 4.0 4.0 5.0 5.0 2.0 7.5 0.707107 7.0 7.0 7.0 8.0 8.0</span> |
| <span class="sd"> 3 1.0 6.0 NaN 6.0 6.0 6.0 6.0 6.0 1.0 9.0 NaN 9.0 9.0 9.0 9.0 9.0</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">StringType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span> |
| <span class="s2">"DataFrameGroupBy.describe() doesn't support for string type for now"</span> |
| <span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">aggregate</span><span class="p">([</span><span class="s2">"count"</span><span class="p">,</span> <span class="s2">"mean"</span><span class="p">,</span> <span class="s2">"std"</span><span class="p">,</span> <span class="s2">"min"</span><span class="p">,</span> <span class="s2">"quartiles"</span><span class="p">,</span> <span class="s2">"max"</span><span class="p">])</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="n">agg_column_labels</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">]</span> |
| <span class="n">formatted_percentiles</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"25%"</span><span class="p">,</span> <span class="s2">"50%"</span><span class="p">,</span> <span class="s2">"75%"</span><span class="p">]</span> |
| |
| <span class="c1"># Split "quartiles" columns into first, second, and third quartiles.</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">agg_column_labels</span><span class="p">:</span> |
| <span class="n">quartiles_col</span> <span class="o">=</span> <span class="n">name_like_string</span><span class="p">(</span><span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="s2">"quartiles"</span><span class="p">]))</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">percentile</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">formatted_percentiles</span><span class="p">):</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">name_like_string</span><span class="p">(</span><span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="n">percentile</span><span class="p">])),</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">quartiles_col</span><span class="p">)[</span><span class="n">i</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">quartiles_col</span><span class="p">)</span> |
| |
| <span class="c1"># Reorder columns lexicographically by agg column followed by stats.</span> |
| <span class="n">stats</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"count"</span><span class="p">,</span> <span class="s2">"mean"</span><span class="p">,</span> <span class="s2">"std"</span><span class="p">,</span> <span class="s2">"min"</span><span class="p">]</span> <span class="o">+</span> <span class="n">formatted_percentiles</span> <span class="o">+</span> <span class="p">[</span><span class="s2">"max"</span><span class="p">]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span><span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="n">s</span><span class="p">])</span> <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">product</span><span class="p">(</span><span class="n">agg_column_labels</span><span class="p">,</span> <span class="n">stats</span><span class="p">)]</span> |
| <span class="n">data_columns</span> <span class="o">=</span> <span class="nb">map</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">)</span> |
| |
| <span class="c1"># Reindex the DataFrame to reflect initial grouping and agg columns.</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># Cast columns to ``"float64"`` to match `pandas.DataFrame.groupby`.</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s2">"float64"</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">corr</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">method</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"pearson"</span><span class="p">,</span> |
| <span class="n">min_periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute pairwise correlation of columns, excluding NA/null values.</span> |
| |
| <span class="sd"> .. versionadded:: 4.0.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> method : {'pearson', 'spearman', 'kendall'}</span> |
| <span class="sd"> * pearson : standard correlation coefficient</span> |
| <span class="sd"> * spearman : Spearman rank correlation</span> |
| <span class="sd"> * kendall : Kendall Tau correlation coefficient</span> |
| |
| <span class="sd"> min_periods : int, default 1</span> |
| <span class="sd"> Minimum number of observations in window required to have a value</span> |
| <span class="sd"> (otherwise result is NA).</span> |
| |
| <span class="sd"> numeric_only : bool, default False</span> |
| <span class="sd"> Include only `float`, `int` or `boolean` data.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.corrwith</span> |
| <span class="sd"> Series.corr</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> 1. Pearson, Kendall and Spearman correlation are currently computed using pairwise</span> |
| <span class="sd"> complete observations.</span> |
| |
| <span class="sd"> 2. The complexity of Kendall correlation is O(#row * #row), if the dataset is too</span> |
| <span class="sd"> large, sampling ahead of correlation computation is recommended.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame(</span> |
| <span class="sd"> ... {"A": [0, 0, 0, 1, 1, 2], "B": [-1, 2, 3, 5, 6, 0], "C": [4, 6, 5, 1, 3, 0]},</span> |
| <span class="sd"> ... columns=["A", "B", "C"])</span> |
| <span class="sd"> >>> df.groupby("A").corr()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 0 B 1.000000 0.720577</span> |
| <span class="sd"> C 0.720577 1.000000</span> |
| <span class="sd"> 1 B 1.000000 1.000000</span> |
| <span class="sd"> C 1.000000 1.000000</span> |
| <span class="sd"> 2 B NaN NaN</span> |
| <span class="sd"> C NaN NaN</span> |
| |
| <span class="sd"> >>> df.groupby("A").corr(min_periods=2)</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 0 B 1.000000 0.720577</span> |
| <span class="sd"> C 0.720577 1.000000</span> |
| <span class="sd"> 1 B 1.000000 1.000000</span> |
| <span class="sd"> C 1.000000 1.000000</span> |
| <span class="sd"> 2 B NaN NaN</span> |
| <span class="sd"> C NaN NaN</span> |
| |
| <span class="sd"> >>> df.groupby("A").corr("spearman")</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 0 B 1.0 0.5</span> |
| <span class="sd"> C 0.5 1.0</span> |
| <span class="sd"> 1 B 1.0 1.0</span> |
| <span class="sd"> C 1.0 1.0</span> |
| <span class="sd"> 2 B NaN NaN</span> |
| <span class="sd"> C NaN NaN</span> |
| |
| <span class="sd"> >>> df.groupby("A").corr('kendall')</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 0 B 1.000000 0.333333</span> |
| <span class="sd"> C 0.333333 1.000000</span> |
| <span class="sd"> 1 B 1.000000 1.000000</span> |
| <span class="sd"> C 1.000000 1.000000</span> |
| <span class="sd"> 2 B 1.000000 NaN</span> |
| <span class="sd"> C NaN 1.000000</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">method</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="s2">"pearson"</span><span class="p">,</span> <span class="s2">"spearman"</span><span class="p">,</span> <span class="s2">"kendall"</span><span class="p">]:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="sa">f</span><span class="s2">"Invalid method </span><span class="si">{</span><span class="n">method</span><span class="si">}</span><span class="s2">"</span><span class="p">)</span> |
| |
| <span class="n">groupkey_names</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="nb">str</span><span class="p">(</span><span class="n">key</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span> |
| <span class="n">internal</span><span class="p">,</span> <span class="n">agg_columns</span><span class="p">,</span> <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_prepare_reduce</span><span class="p">(</span> |
| <span class="n">groupkey_names</span><span class="o">=</span><span class="n">groupkey_names</span><span class="p">,</span> |
| <span class="n">accepted_spark_types</span><span class="o">=</span><span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">)</span> <span class="k">if</span> <span class="n">numeric_only</span> <span class="k">else</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">bool_to_numeric</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">numeric_labels</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">label</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">internal</span><span class="o">.</span><span class="n">spark_type_for</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> <span class="p">(</span><span class="n">NumericType</span><span class="p">,</span> <span class="n">BooleanType</span><span class="p">))</span> |
| <span class="p">]</span> |
| <span class="n">numeric_scols</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">"double"</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">numeric_labels</span> |
| <span class="p">]</span> |
| <span class="n">numeric_col_names</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">numeric_labels</span><span class="p">]</span> |
| <span class="n">num_scols</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">numeric_scols</span><span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="n">index_1_col_name</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__groupby_corr_index_1_temp_column__"</span><span class="p">)</span> |
| <span class="n">index_2_col_name</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__groupby_corr_index_2_temp_column__"</span><span class="p">)</span> |
| |
| <span class="n">pair_scols</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_scols</span><span class="p">):</span> |
| <span class="k">for</span> <span class="n">j</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="n">num_scols</span><span class="p">):</span> |
| <span class="n">pair_scols</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">i</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_1_col_name</span><span class="p">),</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">j</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_2_col_name</span><span class="p">),</span> |
| <span class="n">numeric_scols</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">CORRELATION_VALUE_1_COLUMN</span><span class="p">),</span> |
| <span class="n">numeric_scols</span><span class="p">[</span><span class="n">j</span><span class="p">]</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">CORRELATION_VALUE_2_COLUMN</span><span class="p">),</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> <span class="o">*</span><span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">inline</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">array</span><span class="p">(</span><span class="o">*</span><span class="n">pair_scols</span><span class="p">))])</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">compute</span><span class="p">(</span> |
| <span class="n">sdf</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">groupKeys</span><span class="o">=</span><span class="n">groupkey_names</span> <span class="o">+</span> <span class="p">[</span><span class="n">index_1_col_name</span><span class="p">,</span> <span class="n">index_2_col_name</span><span class="p">],</span> <span class="n">method</span><span class="o">=</span><span class="n">method</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">method</span> <span class="o">==</span> <span class="s2">"kendall"</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="p">,</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_1_col_name</span><span class="p">)</span> <span class="o">==</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_2_col_name</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mf">1.0</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="p">)</span> |
| <span class="p">),</span> |
| <span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="p">,</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">CORRELATION_COUNT_OUTPUT_COLUMN</span><span class="p">)</span> <span class="o"><</span> <span class="n">min_periods</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">None</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="p">)</span> |
| <span class="p">),</span> |
| <span class="p">)</span> |
| |
| <span class="n">auxiliary_col_name</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__groupby_corr_auxiliary_temp_column__"</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">auxiliary_col_name</span><span class="p">,</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_1_col_name</span><span class="p">)</span> <span class="o">==</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_2_col_name</span><span class="p">),</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">([</span><span class="mi">0</span><span class="p">]),</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">]))</span> |
| <span class="p">),</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">auxiliary_col_name</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_1_col_name</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_2_col_name</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_1_col_name</span><span class="p">),</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">auxiliary_col_name</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_2_col_name</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_1_col_name</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">index_2_col_name</span><span class="p">),</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="p">),</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| |
| <span class="n">array_col_name</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__groupby_corr_array_temp_column__"</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span> <span class="o">+</span> <span class="p">[</span><span class="n">index_1_col_name</span><span class="p">])</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">array_sort</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">collect_list</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">struct</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_2_col_name</span><span class="p">),</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="p">),</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">array_col_name</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">num_scols</span><span class="p">):</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">auxiliary_col_name</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">array_col_name</span><span class="p">),</span> <span class="n">i</span><span class="p">))</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">numeric_col_names</span><span class="p">[</span><span class="n">i</span><span class="p">],</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="sa">f</span><span class="s2">"</span><span class="si">{</span><span class="n">auxiliary_col_name</span><span class="si">}</span><span class="s2">.</span><span class="si">{</span><span class="n">CORRELATION_CORR_OUTPUT_COLUMN</span><span class="si">}</span><span class="s2">"</span><span class="p">),</span> |
| <span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">groupkey_names</span> <span class="o">+</span> <span class="p">[</span><span class="n">index_1_col_name</span><span class="p">])</span> <span class="c1"># type: ignore[arg-type]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span> <span class="o">+</span> <span class="n">numeric_col_names</span><span class="p">],</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="n">numeric_col_names</span><span class="p">),</span> <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">index_1_col_name</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">auxiliary_col_name</span><span class="p">),</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">monotonically_increasing_id</span><span class="p">()</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">),</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">key</span><span class="p">)</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">groupkey_names</span> <span class="o">+</span> <span class="p">[</span><span class="n">auxiliary_col_name</span><span class="p">]</span> |
| <span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span> |
| <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span> |
| <span class="p">),</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">numeric_labels</span><span class="p">,</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="n">internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| |
| <span class="k">class</span> <span class="nc">SeriesGroupBy</span><span class="p">(</span><span class="n">GroupBy</span><span class="p">[</span><span class="n">Series</span><span class="p">]):</span> |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_build</span><span class="p">(</span> |
| <span class="n">psser</span><span class="p">:</span> <span class="n">Series</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"SeriesGroupBy"</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">any</span><span class="p">(</span> |
| <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">col_or_s</span><span class="p">)</span> <span class="k">for</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="n">by</span> |
| <span class="p">):</span> |
| <span class="n">psdf</span><span class="p">,</span> <span class="n">new_by_series</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping_from_diff_dataframes</span><span class="p">(</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(),</span> <span class="n">by</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">SeriesGroupBy</span><span class="p">(</span> |
| <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">name</span><span class="p">),</span> |
| <span class="n">new_by_series</span><span class="p">,</span> |
| <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">new_by_series</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> <span class="n">by</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">SeriesGroupBy</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">new_by_series</span><span class="p">,</span> <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">psser</span><span class="p">:</span> <span class="n">Series</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">):</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">as_index</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"as_index=False only valid with DataFrame"</span><span class="p">)</span> |
| <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">=</span><span class="n">psser</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> |
| <span class="n">groupkeys</span><span class="o">=</span><span class="n">by</span><span class="p">,</span> |
| <span class="n">as_index</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span> |
| <span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="nb">set</span><span class="p">(),</span> |
| <span class="n">agg_columns_selected</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">agg_columns</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span> <span class="o">=</span> <span class="n">psser</span> |
| |
| <span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">MissingPandasLikeSeriesGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">):</span> |
| <span class="n">property_or_func</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">MissingPandasLikeSeriesGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="nb">property</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">property_or_func</span><span class="o">.</span><span class="n">fget</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">partial</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span> |
| <span class="k">raise</span> <span class="ne">AttributeError</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">"SeriesGroupBy"</span><span class="p">],</span> <span class="n">Series</span><span class="p">],</span> |
| <span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="n">DataError</span><span class="p">(</span><span class="s2">"No numeric types to aggregate"</span><span class="p">)</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">op</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">should_resolve</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psser</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| |
| <span class="k">def</span> <span class="nf">_handle_output</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">agg_column_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">agg_column_names</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span><span class="p">[</span><span class="n">agg_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]]</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">agg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">MissingPandasLikeSeriesGroupBy</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">aggregate</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">MissingPandasLikeSeriesGroupBy</span><span class="o">.</span><span class="n">aggregate</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="k">return</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">size</span><span class="p">()</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| |
| <span class="n">size</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">size</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <span class="c1"># TODO: add keep parameter</span> |
| <div class="viewcode-block" id="SeriesGroupBy.nsmallest"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.nsmallest.html#pyspark.pandas.groupby.SeriesGroupBy.nsmallest">[docs]</a> <span class="k">def</span> <span class="nf">nsmallest</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return the smallest `n` elements.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int</span> |
| <span class="sd"> Number of items to retrieve.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.nsmallest</span> |
| <span class="sd"> pyspark.pandas.DataFrame.nsmallest</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.groupby(['a'])['b'].nsmallest(1).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 0 1</span> |
| <span class="sd"> 2 3 2</span> |
| <span class="sd"> 3 6 3</span> |
| <span class="sd"> Name: b, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"nsmallest do not support multi-index now"</span><span class="p">)</span> |
| |
| <span class="n">groupkey_col_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)],</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> |
| <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_col_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">.</span><span class="n">asc</span><span class="p">(),</span> |
| <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">temp_rank_column</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__rank__"</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span> <span class="o"><=</span> <span class="n">n</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_col_names</span><span class="p">]</span> |
| <span class="o">+</span> <span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="p">),</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span> |
| <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span> |
| <span class="p">),</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">(</span> |
| <span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="o">+</span> <span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">field</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="p">),</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_column_label</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <span class="c1"># TODO: add keep parameter</span> |
| <div class="viewcode-block" id="SeriesGroupBy.nlargest"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.nlargest.html#pyspark.pandas.groupby.SeriesGroupBy.nlargest">[docs]</a> <span class="k">def</span> <span class="nf">nlargest</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return the first n rows ordered by columns in descending order in group.</span> |
| |
| <span class="sd"> Return the first n rows with the smallest values in columns, in descending order.</span> |
| <span class="sd"> The columns that are not specified are returned as well, but not used for ordering.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int</span> |
| <span class="sd"> Number of items to retrieve.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.nlargest</span> |
| <span class="sd"> pyspark.pandas.DataFrame.nlargest</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.groupby(['a'])['b'].nlargest(1).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 1 2</span> |
| <span class="sd"> 2 4 3</span> |
| <span class="sd"> 3 7 4</span> |
| <span class="sd"> Name: b, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"nlargest do not support multi-index now"</span><span class="p">)</span> |
| |
| <span class="n">groupkey_col_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)],</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> |
| <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_col_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">.</span><span class="n">desc</span><span class="p">(),</span> |
| <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">temp_rank_column</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__rank__"</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span> <span class="o"><=</span> <span class="n">n</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_col_names</span><span class="p">]</span> |
| <span class="o">+</span> <span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="p">),</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span> |
| <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span> |
| <span class="p">),</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">(</span> |
| <span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="o">+</span> <span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">field</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="p">),</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_column_label</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <span class="c1"># TODO: add bins, normalize parameter</span> |
| <div class="viewcode-block" id="SeriesGroupBy.value_counts"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.value_counts.html#pyspark.pandas.groupby.SeriesGroupBy.value_counts">[docs]</a> <span class="k">def</span> <span class="nf">value_counts</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">sort</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Compute group sizes.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> sort : boolean, default None</span> |
| <span class="sd"> Sort by frequencies.</span> |
| <span class="sd"> ascending : boolean, default False</span> |
| <span class="sd"> Sort in ascending order.</span> |
| <span class="sd"> dropna : boolean, default True</span> |
| <span class="sd"> Don't include counts of NaN.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'B': [1, 1, 2, 3, 3, np.nan]},</span> |
| <span class="sd"> ... columns=['A', 'B'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 1.0</span> |
| <span class="sd"> 1 2 1.0</span> |
| <span class="sd"> 2 2 2.0</span> |
| <span class="sd"> 3 3 3.0</span> |
| <span class="sd"> 4 3 3.0</span> |
| <span class="sd"> 5 3 NaN</span> |
| |
| <span class="sd"> >>> df.groupby('A')['B'].value_counts().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 1 1.0 1</span> |
| <span class="sd"> 2 1.0 1</span> |
| <span class="sd"> 2.0 1</span> |
| <span class="sd"> 3 3.0 2</span> |
| <span class="sd"> Name: count, dtype: int64</span> |
| |
| <span class="sd"> Don't include counts of NaN when dropna is False.</span> |
| |
| <span class="sd"> >>> df.groupby('A')['B'].value_counts(</span> |
| <span class="sd"> ... dropna=False).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 1 1.0 1</span> |
| <span class="sd"> 2 1.0 1</span> |
| <span class="sd"> 2.0 1</span> |
| <span class="sd"> 3 3.0 2</span> |
| <span class="sd"> NaN 1</span> |
| <span class="sd"> Name: count, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"The resulting Series will have a fixed name of 'count' from 4.0.0."</span><span class="p">,</span> |
| <span class="ne">FutureWarning</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">groupkeys</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))]</span> |
| <span class="n">groupkey_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| |
| <span class="n">agg_column</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_cols</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s2">"count"</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">:</span> |
| <span class="n">_groupkey_column_names</span> <span class="o">=</span> <span class="n">groupkey_names</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span><span class="n">subset</span><span class="o">=</span><span class="n">_groupkey_column_names</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">dropna</span><span class="p">:</span> |
| <span class="n">_agg_columns_names</span> <span class="o">=</span> <span class="n">groupkey_names</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> <span class="p">:]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span><span class="n">subset</span><span class="o">=</span><span class="n">_agg_columns_names</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">sort</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">ascending</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)</span><span class="o">.</span><span class="n">asc</span><span class="p">())</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)</span><span class="o">.</span><span class="n">desc</span><span class="p">())</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">groupkeys</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[(</span><span class="s2">"count"</span><span class="p">,)],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="SeriesGroupBy.unique"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.unique.html#pyspark.pandas.groupby.SeriesGroupBy.unique">[docs]</a> <span class="k">def</span> <span class="nf">unique</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Return unique values in group.</span> |
| |
| <span class="sd"> Unique is returned in order of unknown. It does NOT sort.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.unique</span> |
| <span class="sd"> pyspark.pandas.Index.unique</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.groupby(['a'])['b'].unique().sort_index() # doctest: +SKIP</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 [1, 2]</span> |
| <span class="sd"> 2 [2, 3]</span> |
| <span class="sd"> 3 [3, 4]</span> |
| <span class="sd"> Name: b, dtype: object</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">collect_set</span><span class="p">)</span></div> |
| |
| |
| <span class="k">def</span> <span class="nf">is_multi_agg_with_relabel</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Check whether the kwargs pass to .agg look like multi-agg with relabling.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> **kwargs : dict</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> bool</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> is_multi_agg_with_relabel(a='max')</span> |
| <span class="sd"> False</span> |
| <span class="sd"> >>> is_multi_agg_with_relabel(a_max=('a', 'max'),</span> |
| <span class="sd"> ... a_min=('a', 'min'))</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> is_multi_agg_with_relabel()</span> |
| <span class="sd"> False</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">kwargs</span><span class="p">:</span> |
| <span class="k">return</span> <span class="kc">False</span> |
| <span class="k">return</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">values</span><span class="p">())</span> |
| |
| |
| <span class="k">def</span> <span class="nf">normalize_keyword_aggregation</span><span class="p">(</span> |
| <span class="n">kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="nb">str</span><span class="p">]],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">]]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Normalize user-provided kwargs.</span> |
| |
| <span class="sd"> Transforms from the new ``Dict[str, NamedAgg]`` style kwargs</span> |
| <span class="sd"> to the old defaultdict[str, List[scalar]].</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> kwargs : dict</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> aggspec : dict</span> |
| <span class="sd"> The transformed kwargs.</span> |
| <span class="sd"> columns : List[str]</span> |
| <span class="sd"> The user-provided keys.</span> |
| <span class="sd"> order : List[Tuple[str, str]]</span> |
| <span class="sd"> Pairs of the input and output column names.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> normalize_keyword_aggregation({'output': ('input', 'sum')})</span> |
| <span class="sd"> (defaultdict(<class 'list'>, {'input': ['sum']}), ['output'], [('input', 'sum')])</span> |
| <span class="sd"> """</span> |
| <span class="n">aggspec</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">list</span><span class="p">)</span> |
| <span class="n">order</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">columns</span><span class="p">,</span> <span class="n">pairs</span> <span class="o">=</span> <span class="nb">zip</span><span class="p">(</span><span class="o">*</span><span class="n">kwargs</span><span class="o">.</span><span class="n">items</span><span class="p">())</span> |
| |
| <span class="k">for</span> <span class="n">column</span><span class="p">,</span> <span class="n">aggfunc</span> <span class="ow">in</span> <span class="n">pairs</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">column</span> <span class="ow">in</span> <span class="n">aggspec</span><span class="p">:</span> |
| <span class="n">aggspec</span><span class="p">[</span><span class="n">column</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">aggfunc</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">aggspec</span><span class="p">[</span><span class="n">column</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="n">aggfunc</span><span class="p">]</span> |
| |
| <span class="n">order</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">column</span><span class="p">,</span> <span class="n">aggfunc</span><span class="p">))</span> |
| <span class="c1"># For MultiIndex, we need to flatten the tuple, e.g. (('y', 'A'), 'max') needs to be</span> |
| <span class="c1"># flattened to ('y', 'A', 'max'), it won't do anything on normal Index.</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">order</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">],</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="n">order</span> <span class="o">=</span> <span class="p">[(</span><span class="o">*</span><span class="n">levs</span><span class="p">,</span> <span class="n">method</span><span class="p">)</span> <span class="k">for</span> <span class="n">levs</span><span class="p">,</span> <span class="n">method</span> <span class="ow">in</span> <span class="n">order</span><span class="p">]</span> |
| <span class="k">return</span> <span class="n">aggspec</span><span class="p">,</span> <span class="nb">list</span><span class="p">(</span><span class="n">columns</span><span class="p">),</span> <span class="n">order</span> |
| |
| |
| <span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">os</span> |
| <span class="kn">import</span> <span class="nn">doctest</span> |
| <span class="kn">import</span> <span class="nn">sys</span> |
| <span class="kn">import</span> <span class="nn">numpy</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="kn">import</span> <span class="nn">pyspark.pandas.groupby</span> |
| |
| <span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">"SPARK_HOME"</span><span class="p">])</span> |
| |
| <span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">groupby</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"np"</span><span class="p">]</span> <span class="o">=</span> <span class="n">numpy</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"ps"</span><span class="p">]</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">"local[4]"</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">"pyspark.pandas.groupby tests"</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span> |
| <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">groupby</span><span class="p">,</span> |
| <span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> |
| <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span> |
| <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="n">_test</span><span class="p">()</span> |
| </pre></div> |
| |
| </article> |
| |
| |
| |
| <footer class="bd-footer-article"> |
| |
| <div class="footer-article-items footer-article__inner"> |
| |
| <div class="footer-article-item"><!-- Previous / next buttons --> |
| <div class="prev-next-area"> |
| </div></div> |
| |
| </div> |
| |
| </footer> |
| |
| </div> |
| |
| |
| |
| |
| </div> |
| <footer class="bd-footer-content"> |
| |
| </footer> |
| |
| </main> |
| </div> |
| </div> |
| |
| <!-- Scripts loaded after <body> so the DOM is not blocked --> |
| <script src="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script> |
| <script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script> |
| |
| <footer class="bd-footer"> |
| <div class="bd-footer__inner bd-page-width"> |
| |
| <div class="footer-items__start"> |
| |
| <div class="footer-item"><p class="copyright"> |
| Copyright @ 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. |
| </p></div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0. |
| <br/> |
| </p> |
| </div> |
| |
| </div> |
| |
| |
| <div class="footer-items__end"> |
| |
| <div class="footer-item"><p class="theme-version"> |
| Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3. |
| </p></div> |
| |
| </div> |
| |
| </div> |
| |
| </footer> |
| </body> |
| </html> |