Source code for pyspark.pandas.groupby
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
A wrapper for GroupedData to behave similar to pandas GroupBy.
"""
from abc import ABCMeta, abstractmethod
import inspect
from collections import defaultdict, namedtuple
from distutils.version import LooseVersion
from functools import partial
from itertools import product
from typing import (
    Any,
    Callable,
    Dict,
    Generic,
    Iterator,
    Mapping,
    List,
    Optional,
    Sequence,
    Set,
    Tuple,
    Union,
    cast,
    TYPE_CHECKING,
)
import warnings

import pandas as pd
from pandas.api.types import is_hashable, is_list_like  # type: ignore[attr-defined]

if LooseVersion(pd.__version__) >= LooseVersion("1.3.0"):
    from pandas.core.common import _builtin_table  # type: ignore[attr-defined]
else:
    from pandas.core.base import SelectionMixin

    _builtin_table = SelectionMixin._builtin_table  # type: ignore[attr-defined]

from pyspark.sql import Column, DataFrame as SparkDataFrame, Window, functions as F
from pyspark.sql.types import (
    NumericType,
    StructField,
    StructType,
    StringType,
)

from pyspark import pandas as ps  # For running doctests and reference resolution in PyCharm.
from pyspark.pandas._typing import Axis, FrameLike, Label, Name
from pyspark.pandas.typedef import infer_return_type, DataFrameType, ScalarType, SeriesType
from pyspark.pandas.frame import DataFrame
from pyspark.pandas.internal import (
    InternalField,
    InternalFrame,
    HIDDEN_COLUMNS,
    NATURAL_ORDER_COLUMN_NAME,
    SPARK_INDEX_NAME_FORMAT,
    SPARK_DEFAULT_SERIES_NAME,
    SPARK_INDEX_NAME_PATTERN,
)
from pyspark.pandas.missing.groupby import (
    MissingPandasLikeDataFrameGroupBy,
    MissingPandasLikeSeriesGroupBy,
)
from pyspark.pandas.series import Series, first_series
from pyspark.pandas.spark import functions as SF
from pyspark.pandas.config import get_option
from pyspark.pandas.utils import (
    align_diff_frames,
    is_name_like_tuple,
    is_name_like_value,
    name_like_string,
    same_anchor,
    scol_for,
    verify_temp_column_name,
    log_advice,
)
from pyspark.pandas.spark.utils import as_nullable_spark_type, force_decimal_precision_scale
from pyspark.pandas.exceptions import DataError

if TYPE_CHECKING:
    from pyspark.pandas.window import RollingGroupby, ExpandingGroupby

# to keep it the same as pandas
NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])


class GroupBy(Generic[FrameLike], metaclass=ABCMeta):
    """
    :ivar _psdf: The parent dataframe that is used to perform the groupby
    :type _psdf: DataFrame
    :ivar _groupkeys: The list of keys that will be used to perform the grouping
    :type _groupkeys: List[Series]
    """

    def __init__(
        self,
        psdf: DataFrame,
        groupkeys: List[Series],
        as_index: bool,
        dropna: bool,
        column_labels_to_exclude: Set[Label],
        agg_columns_selected: bool,
        agg_columns: List[Series],
    ):
        self._psdf = psdf
        self._groupkeys = groupkeys
        self._as_index = as_index
        self._dropna = dropna
        self._column_labels_to_exclude = column_labels_to_exclude
        self._agg_columns_selected = agg_columns_selected
        self._agg_columns = agg_columns

    @property
    def _groupkeys_scols(self) -> List[Column]:
        return [s.spark.column for s in self._groupkeys]

    @property
    def _agg_columns_scols(self) -> List[Column]:
        return [s.spark.column for s in self._agg_columns]

    @abstractmethod
    def _apply_series_op(
        self,
        op: Callable[["SeriesGroupBy"], Series],
        should_resolve: bool = False,
        numeric_only: bool = False,
    ) -> FrameLike:
        pass

    @abstractmethod
    def _cleanup_and_return(self, psdf: DataFrame) -> FrameLike:
        pass

    # TODO: Series support is not implemented yet.
    # TODO: not all arguments are implemented comparing to pandas' for now.
    def aggregate(
        self,
        func_or_funcs: Optional[Union[str, List[str], Dict[Name, Union[str, List[str]]]]] = None,
        *args: Any,
        **kwargs: Any,
    ) -> DataFrame:
        """Aggregate using one or more operations over the specified axis.

        Parameters
        ----------
        func_or_funcs : dict, str or list
            a dict mapping from column name (string) to
            aggregate functions (string or list of strings).

        Returns
        -------
        Series or DataFrame
            The return can be:

            * Series : when DataFrame.agg is called with a single function
            * DataFrame : when DataFrame.agg is called with several functions

            Return Series or DataFrame.

        Notes
        -----
        `agg` is an alias for `aggregate`. Use the alias.

        See Also
        --------
        pyspark.pandas.Series.groupby
        pyspark.pandas.DataFrame.groupby

        Examples
        --------
        >>> df = ps.DataFrame({'A': [1, 1, 2, 2],
        ...                    'B': [1, 2, 3, 4],
        ...                    'C': [0.362, 0.227, 1.267, -0.562]},
        ...                   columns=['A', 'B', 'C'])

        >>> df
           A  B      C
        0  1  1  0.362
        1  1  2  0.227
        2  2  3  1.267
        3  2  4 -0.562

        Different aggregations per column

        >>> aggregated = df.groupby('A').agg({'B': 'min', 'C': 'sum'})
        >>> aggregated[['B', 'C']].sort_index()  # doctest: +NORMALIZE_WHITESPACE
           B      C
        A
        1  1  0.589
        2  3  0.705

        >>> aggregated = df.groupby('A').agg({'B': ['min', 'max']})
        >>> aggregated.sort_index()  # doctest: +NORMALIZE_WHITESPACE
             B
           min  max
        A
        1    1    2
        2    3    4

        >>> aggregated = df.groupby('A').agg('min')
        >>> aggregated.sort_index()  # doctest: +NORMALIZE_WHITESPACE
           B      C
        A
        1  1  0.227
        2  3 -0.562

        >>> aggregated = df.groupby('A').agg(['min', 'max'])
        >>> aggregated.sort_index()  # doctest: +NORMALIZE_WHITESPACE
             B           C
           min  max    min    max
        A
        1    1    2  0.227  0.362
        2    3    4 -0.562  1.267

        To control the output names with different aggregations per column, pandas-on-Spark
        also supports 'named aggregation' or nested renaming in .agg. It can also be
        used when applying multiple aggregation functions to specific columns.

        >>> aggregated = df.groupby('A').agg(b_max=ps.NamedAgg(column='B', aggfunc='max'))
        >>> aggregated.sort_index()  # doctest: +NORMALIZE_WHITESPACE
             b_max
        A
        1        2
        2        4

        >>> aggregated = df.groupby('A').agg(b_max=('B', 'max'), b_min=('B', 'min'))
        >>> aggregated.sort_index()  # doctest: +NORMALIZE_WHITESPACE
             b_max   b_min
        A
        1        2       1
        2        4       3

        >>> aggregated = df.groupby('A').agg(b_max=('B', 'max'), c_min=('C', 'min'))
        >>> aggregated.sort_index()  # doctest: +NORMALIZE_WHITESPACE
             b_max   c_min
        A
        1        2   0.227
        2        4  -0.562
        """
        # I think current implementation of func and arguments in pandas-on-Spark for aggregate
        # is different than pandas, later once arguments are added, this could be removed.
        if func_or_funcs is None and kwargs is None:
            raise ValueError("No aggregation argument or function specified.")

        relabeling = func_or_funcs is None and is_multi_agg_with_relabel(**kwargs)
        if relabeling:
            (
                func_or_funcs,
                columns,
                order,
            ) = normalize_keyword_aggregation(  # type: ignore[assignment]
                kwargs
            )

        if not isinstance(func_or_funcs, (str, list)):
            if not isinstance(func_or_funcs, dict) or not all(
                is_name_like_value(key)
                and (
                    isinstance(value, str)
                    or isinstance(value, list)
                    and all(isinstance(v, str) for v in value)
                )
                for key, value in func_or_funcs.items()
            ):
                raise ValueError(
                    "aggs must be a dict mapping from column name "
                    "to aggregate functions (string or list of strings)."
                )

        else:
            agg_cols = [col.name for col in self._agg_columns]
            func_or_funcs = {col: func_or_funcs for col in agg_cols}

        psdf: DataFrame = DataFrame(
            GroupBy._spark_groupby(self._psdf, func_or_funcs, self._groupkeys)
        )

        if self._dropna:
            psdf = DataFrame(
                psdf._internal.with_new_sdf(
                    psdf._internal.spark_frame.dropna(
                        subset=psdf._internal.index_spark_column_names
                    )
                )
            )

        if not self._as_index:
            should_drop_index = set(
                i for i, gkey in enumerate(self._groupkeys) if gkey._psdf is not self._psdf
            )
            if len(should_drop_index) > 0:
                psdf = psdf.reset_index(level=should_drop_index, drop=True)
            if len(should_drop_index) < len(self._groupkeys):
                psdf = psdf.reset_index()

        if relabeling:
            psdf = psdf[order]
            psdf.columns = columns  # type: ignore[assignment]
        return psdf

    agg = aggregate
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_spark_groupby</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">func</span><span class="p">:</span> <span class="n">Mapping</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]],</span>
<span class="n">groupkeys</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Series</span><span class="p">]</span> <span class="o">=</span> <span class="p">(),</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">InternalFrame</span><span class="p">:</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))]</span>
<span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span>
<span class="n">multi_aggs</span> <span class="o">=</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">func</span><span class="o">.</span><span class="n">values</span><span class="p">())</span>
<span class="n">reordered</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">data_columns</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">func</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="n">label</span> <span class="o">=</span> <span class="n">key</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">key</span><span class="p">,)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">!=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;The length of the key must be the same as the column label level.&quot;</span><span class="p">)</span>
<span class="k">for</span> <span class="n">aggfunc</span> <span class="ow">in</span> <span class="p">[</span><span class="n">value</span><span class="p">]</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="n">value</span><span class="p">:</span>
<span class="n">column_label</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="n">aggfunc</span><span class="p">])</span> <span class="k">if</span> <span class="n">multi_aggs</span> <span class="k">else</span> <span class="n">label</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">column_label</span><span class="p">)</span>
<span class="n">data_col</span> <span class="o">=</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">column_label</span><span class="p">)</span>
<span class="n">data_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">data_col</span><span class="p">)</span>
<span class="n">col_name</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">if</span> <span class="n">aggfunc</span> <span class="o">==</span> <span class="s2">&quot;nunique&quot;</span><span class="p">:</span>
<span class="n">reordered</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span><span class="s2">&quot;count(DISTINCT `</span><span class="si">{0}</span><span class="s2">`) as `</span><span class="si">{1}</span><span class="s2">`&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">col_name</span><span class="p">,</span> <span class="n">data_col</span><span class="p">))</span>
<span class="p">)</span>
<span class="c1"># Implement &quot;quartiles&quot; aggregate function for ``describe``.</span>
<span class="k">elif</span> <span class="n">aggfunc</span> <span class="o">==</span> <span class="s2">&quot;quartiles&quot;</span><span class="p">:</span>
<span class="n">reordered</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span>
<span class="s2">&quot;percentile_approx(`</span><span class="si">{0}</span><span class="s2">`, array(0.25, 0.5, 0.75)) as `</span><span class="si">{1}</span><span class="s2">`&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">col_name</span><span class="p">,</span> <span class="n">data_col</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">reordered</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">{1}</span><span class="s2">(`</span><span class="si">{0}</span><span class="s2">`) as `</span><span class="si">{2}</span><span class="s2">`&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">col_name</span><span class="p">,</span> <span class="n">aggfunc</span><span class="p">,</span> <span class="n">data_col</span><span class="p">))</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">groupkey_scols</span> <span class="o">+</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">reordered</span><span class="p">)</span>
<span class="k">return</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">groupkeys</span><span class="p">],</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span>
<span class="p">)</span>
<div class="viewcode-block" id="GroupBy.count"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.count.html#pyspark.pandas.groupby.GroupBy.count">[docs]</a> <span class="k">def</span> <span class="nf">count</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute count of group, excluding missing values.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 1, 2, 1, 2],</span>
<span class="sd"> ... &#39;B&#39;: [np.nan, 2, 3, 4, 5],</span>
<span class="sd"> ... &#39;C&#39;: [1, 2, 1, 1, 2]}, columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).count().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 1 2 3</span>
<span class="sd"> 2 2 2</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div>
<span class="c1"># TODO: We should fix See Also when Series implementation is finished.</span>
<div class="viewcode-block" id="GroupBy.first"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.first.html#pyspark.pandas.groupby.GroupBy.first">[docs]</a> <span class="k">def</span> <span class="nf">first</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute first of group values.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">first</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.last"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.last.html#pyspark.pandas.groupby.GroupBy.last">[docs]</a> <span class="k">def</span> <span class="nf">last</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute last of group values.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">last</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">ignorenulls</span><span class="o">=</span><span class="kc">True</span><span class="p">),</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.max"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.max.html#pyspark.pandas.groupby.GroupBy.max">[docs]</a> <span class="k">def</span> <span class="nf">max</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute max of group values.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div>
<span class="c1"># TODO: examples should be updated.</span>
<div class="viewcode-block" id="GroupBy.mean"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.mean.html#pyspark.pandas.groupby.GroupBy.mean">[docs]</a> <span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute mean of groups, excluding missing values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> pyspark.pandas.Series or pyspark.pandas.DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 1, 2, 1, 2],</span>
<span class="sd"> ... &#39;B&#39;: [np.nan, 2, 3, 4, 5],</span>
<span class="sd"> ... &#39;C&#39;: [1, 2, 1, 1, 2]}, columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;])</span>
<span class="sd"> Groupby one column and return the mean of the remaining columns in</span>
<span class="sd"> each group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).mean().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C</span>
<span class="sd"> A</span>
<span class="sd"> 1 3.0 1.333333</span>
<span class="sd"> 2 4.0 1.500000</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">mean</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.min"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.min.html#pyspark.pandas.groupby.GroupBy.min">[docs]</a> <span class="k">def</span> <span class="nf">min</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute min of group values.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div>
<span class="c1"># TODO: sync the doc.</span>
<div class="viewcode-block" id="GroupBy.std"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.std.html#pyspark.pandas.groupby.GroupBy.std">[docs]</a> <span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute standard deviation of groups, excluding missing values.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> ddof : int, default 1</span>
<span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span>
<span class="sd"> where N represents the number of elements.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">assert</span> <span class="n">ddof</span> <span class="ow">in</span> <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">stddev_pop</span> <span class="k">if</span> <span class="n">ddof</span> <span class="o">==</span> <span class="mi">0</span> <span class="k">else</span> <span class="n">F</span><span class="o">.</span><span class="n">stddev_samp</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">True</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.sum"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.sum.html#pyspark.pandas.groupby.GroupBy.sum">[docs]</a> <span class="k">def</span> <span class="nf">sum</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute sum of group values</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div>
<span class="c1"># TODO: sync the doc.</span>
<div class="viewcode-block" id="GroupBy.var"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.var.html#pyspark.pandas.groupby.GroupBy.var">[docs]</a> <span class="k">def</span> <span class="nf">var</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute variance of groups, excluding missing values.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> ddof : int, default 1</span>
<span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span>
<span class="sd"> where N represents the number of elements.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">assert</span> <span class="n">ddof</span> <span class="ow">in</span> <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">var_pop</span> <span class="k">if</span> <span class="n">ddof</span> <span class="o">==</span> <span class="mi">0</span> <span class="k">else</span> <span class="n">F</span><span class="o">.</span><span class="n">var_samp</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">True</span>
<span class="p">)</span></div>
<span class="c1"># TODO: skipna should be implemented.</span>
<div class="viewcode-block" id="GroupBy.all"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.all.html#pyspark.pandas.groupby.GroupBy.all">[docs]</a> <span class="k">def</span> <span class="nf">all</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns True if all values in the group are truthful, else False.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],</span>
<span class="sd"> ... &#39;B&#39;: [True, True, True, False, False,</span>
<span class="sd"> ... False, None, True, None, False]},</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 1 True</span>
<span class="sd"> 1 1 True</span>
<span class="sd"> 2 2 True</span>
<span class="sd"> 3 2 False</span>
<span class="sd"> 4 3 False</span>
<span class="sd"> 5 3 False</span>
<span class="sd"> 6 4 None</span>
<span class="sd"> 7 4 True</span>
<span class="sd"> 8 5 None</span>
<span class="sd"> 9 5 False</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).all().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B</span>
<span class="sd"> A</span>
<span class="sd"> 1 True</span>
<span class="sd"> 2 False</span>
<span class="sd"> 3 False</span>
<span class="sd"> 4 True</span>
<span class="sd"> 5 False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;boolean&quot;</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">))),</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span>
<span class="p">)</span></div>
<span class="c1"># TODO: skipna should be implemented.</span>
<div class="viewcode-block" id="GroupBy.any"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.any.html#pyspark.pandas.groupby.GroupBy.any">[docs]</a> <span class="k">def</span> <span class="nf">any</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns True if any value in the group is truthful, else False.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],</span>
<span class="sd"> ... &#39;B&#39;: [True, True, True, False, False,</span>
<span class="sd"> ... False, None, True, None, False]},</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 1 True</span>
<span class="sd"> 1 1 True</span>
<span class="sd"> 2 2 True</span>
<span class="sd"> 3 2 False</span>
<span class="sd"> 4 3 False</span>
<span class="sd"> 5 3 False</span>
<span class="sd"> 6 4 None</span>
<span class="sd"> 7 4 True</span>
<span class="sd"> 8 5 None</span>
<span class="sd"> 9 5 False</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).any().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B</span>
<span class="sd"> A</span>
<span class="sd"> 1 True</span>
<span class="sd"> 2 True</span>
<span class="sd"> 3 False</span>
<span class="sd"> 4 True</span>
<span class="sd"> 5 False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">&quot;boolean&quot;</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">))),</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span>
<span class="p">)</span></div>
<span class="c1"># TODO: groupby multiply columns should be implemented.</span>
<div class="viewcode-block" id="GroupBy.size"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.size.html#pyspark.pandas.groupby.GroupBy.size">[docs]</a> <span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute group sizes.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;B&#39;: [1, 1, 2, 3, 3, 3]},</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 1 1</span>
<span class="sd"> 1 2 1</span>
<span class="sd"> 2 2 2</span>
<span class="sd"> 3 3 3</span>
<span class="sd"> 4 3 3</span>
<span class="sd"> 5 3 3</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).size().sort_index()</span>
<span class="sd"> A</span>
<span class="sd"> 1 1</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 3</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;A&#39;, &#39;B&#39;]).size().sort_index()</span>
<span class="sd"> A B</span>
<span class="sd"> 1 1 1</span>
<span class="sd"> 2 1 1</span>
<span class="sd"> 2 1</span>
<span class="sd"> 3 3 3</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> For Series,</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).size().sort_index()</span>
<span class="sd"> A</span>
<span class="sd"> 1 1</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 3</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(df.A).B.size().sort_index()</span>
<span class="sd"> A</span>
<span class="sd"> 1 1</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 3</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">groupkeys</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))]</span>
<span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">groupkey_scols</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">groupkeys</span><span class="p">],</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;count&quot;</span><span class="p">)],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<div class="viewcode-block" id="GroupBy.diff"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.diff.html#pyspark.pandas.groupby.GroupBy.diff">[docs]</a> <span class="k">def</span> <span class="nf">diff</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> First discrete difference of element.</span>
<span class="sd"> Calculates the difference of a DataFrame element compared with another element in the</span>
<span class="sd"> DataFrame group (default is the element in the same column of the previous row).</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> periods : int, default 1</span>
<span class="sd"> Periods to shift for calculating difference, accepts negative values.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> diffed : DataFrame or Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 2, 3, 4, 5, 6],</span>
<span class="sd"> ... &#39;b&#39;: [1, 1, 2, 3, 5, 8],</span>
<span class="sd"> ... &#39;c&#39;: [1, 4, 9, 16, 25, 36]}, columns=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b c</span>
<span class="sd"> 0 1 1 1</span>
<span class="sd"> 1 2 1 4</span>
<span class="sd"> 2 3 2 9</span>
<span class="sd"> 3 4 3 16</span>
<span class="sd"> 4 5 5 25</span>
<span class="sd"> 5 6 8 36</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;b&#39;]).diff().sort_index()</span>
<span class="sd"> a c</span>
<span class="sd"> 0 NaN NaN</span>
<span class="sd"> 1 1.0 3.0</span>
<span class="sd"> 2 NaN NaN</span>
<span class="sd"> 3 NaN NaN</span>
<span class="sd"> 4 NaN NaN</span>
<span class="sd"> 5 NaN NaN</span>
<span class="sd"> Difference with previous column in a group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;b&#39;])[&#39;a&#39;].diff().sort_index()</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 1.0</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> 4 NaN</span>
<span class="sd"> 5 NaN</span>
<span class="sd"> Name: a, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_diff</span><span class="p">(</span><span class="n">periods</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.cumcount"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cumcount.html#pyspark.pandas.groupby.GroupBy.cumcount">[docs]</a> <span class="k">def</span> <span class="nf">cumcount</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Number each item in each group from 0 to the length of that group - 1.</span>
<span class="sd"> Essentially this is equivalent to</span>
<span class="sd"> .. code-block:: python</span>
<span class="sd"> self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> ascending : bool, default True</span>
<span class="sd"> If False, number in reverse, from length of group - 1 to 0.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series</span>
<span class="sd"> Sequence number of each element within each group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame([[&#39;a&#39;], [&#39;a&#39;], [&#39;a&#39;], [&#39;b&#39;], [&#39;b&#39;], [&#39;a&#39;]],</span>
<span class="sd"> ... columns=[&#39;A&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A</span>
<span class="sd"> 0 a</span>
<span class="sd"> 1 a</span>
<span class="sd"> 2 a</span>
<span class="sd"> 3 b</span>
<span class="sd"> 4 b</span>
<span class="sd"> 5 a</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).cumcount().sort_index()</span>
<span class="sd"> 0 0</span>
<span class="sd"> 1 1</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 0</span>
<span class="sd"> 4 1</span>
<span class="sd"> 5 3</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;).cumcount(ascending=False).sort_index()</span>
<span class="sd"> 0 3</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 1</span>
<span class="sd"> 3 1</span>
<span class="sd"> 4 0</span>
<span class="sd"> 5 0</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">ret</span> <span class="o">=</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="o">.</span><span class="n">rename</span><span class="p">()</span>
<span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="k">lambda</span> <span class="n">_</span><span class="p">:</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span>
<span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">ascending</span><span class="o">=</span><span class="n">ascending</span><span class="p">)</span>
<span class="o">-</span> <span class="mi">1</span>
<span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">ret</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<div class="viewcode-block" id="GroupBy.cummax"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cummax.html#pyspark.pandas.groupby.GroupBy.cummax">[docs]</a> <span class="k">def</span> <span class="nf">cummax</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Cumulative max for each group.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.cummax</span>
<span class="sd"> DataFrame.cummax</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(</span>
<span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span>
<span class="sd"> ... columns=list(&#39;ABC&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C</span>
<span class="sd"> 0 1 NaN 4</span>
<span class="sd"> 1 1 0.1 3</span>
<span class="sd"> 2 1 20.0 2</span>
<span class="sd"> 3 4 10.0 1</span>
<span class="sd"> By default, iterates over rows and finds the sum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).cummax().sort_index()</span>
<span class="sd"> B C</span>
<span class="sd"> 0 NaN 4</span>
<span class="sd"> 1 0.1 4</span>
<span class="sd"> 2 20.0 4</span>
<span class="sd"> 3 10.0 1</span>
<span class="sd"> It works as below in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.C.groupby(df.A).cummax().sort_index()</span>
<span class="sd"> 0 4</span>
<span class="sd"> 1 4</span>
<span class="sd"> 2 4</span>
<span class="sd"> 3 1</span>
<span class="sd"> Name: C, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span>
<span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.cummin"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cummin.html#pyspark.pandas.groupby.GroupBy.cummin">[docs]</a> <span class="k">def</span> <span class="nf">cummin</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Cumulative min for each group.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.cummin</span>
<span class="sd"> DataFrame.cummin</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(</span>
<span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span>
<span class="sd"> ... columns=list(&#39;ABC&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C</span>
<span class="sd"> 0 1 NaN 4</span>
<span class="sd"> 1 1 0.1 3</span>
<span class="sd"> 2 1 20.0 2</span>
<span class="sd"> 3 4 10.0 1</span>
<span class="sd"> By default, iterates over rows and finds the sum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).cummin().sort_index()</span>
<span class="sd"> B C</span>
<span class="sd"> 0 NaN 4</span>
<span class="sd"> 1 0.1 3</span>
<span class="sd"> 2 0.1 2</span>
<span class="sd"> 3 10.0 1</span>
<span class="sd"> It works as below in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).cummin().sort_index()</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 0.1</span>
<span class="sd"> 2 0.1</span>
<span class="sd"> 3 10.0</span>
<span class="sd"> Name: B, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span>
<span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.cumprod"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cumprod.html#pyspark.pandas.groupby.GroupBy.cumprod">[docs]</a> <span class="k">def</span> <span class="nf">cumprod</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Cumulative product for each group.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.cumprod</span>
<span class="sd"> DataFrame.cumprod</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(</span>
<span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span>
<span class="sd"> ... columns=list(&#39;ABC&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C</span>
<span class="sd"> 0 1 NaN 4</span>
<span class="sd"> 1 1 0.1 3</span>
<span class="sd"> 2 1 20.0 2</span>
<span class="sd"> 3 4 10.0 1</span>
<span class="sd"> By default, iterates over rows and finds the sum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).cumprod().sort_index()</span>
<span class="sd"> B C</span>
<span class="sd"> 0 NaN 4</span>
<span class="sd"> 1 0.1 12</span>
<span class="sd"> 2 2.0 24</span>
<span class="sd"> 3 10.0 1</span>
<span class="sd"> It works as below in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).cumprod().sort_index()</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 0.1</span>
<span class="sd"> 2 2.0</span>
<span class="sd"> 3 10.0</span>
<span class="sd"> Name: B, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cumprod</span><span class="p">(</span><span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span>
<span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.cumsum"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cumsum.html#pyspark.pandas.groupby.GroupBy.cumsum">[docs]</a> <span class="k">def</span> <span class="nf">cumsum</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Cumulative sum for each group.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.cumsum</span>
<span class="sd"> DataFrame.cumsum</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame(</span>
<span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span>
<span class="sd"> ... columns=list(&#39;ABC&#39;))</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C</span>
<span class="sd"> 0 1 NaN 4</span>
<span class="sd"> 1 1 0.1 3</span>
<span class="sd"> 2 1 20.0 2</span>
<span class="sd"> 3 4 10.0 1</span>
<span class="sd"> By default, iterates over rows and finds the sum in each column.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;A&quot;).cumsum().sort_index()</span>
<span class="sd"> B C</span>
<span class="sd"> 0 NaN 4</span>
<span class="sd"> 1 0.1 7</span>
<span class="sd"> 2 20.1 9</span>
<span class="sd"> 3 10.0 1</span>
<span class="sd"> It works as below in Series.</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).cumsum().sort_index()</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 0.1</span>
<span class="sd"> 2 20.1</span>
<span class="sd"> 3 10.0</span>
<span class="sd"> Name: B, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cumsum</span><span class="p">(</span><span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span>
<span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.apply"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.apply.html#pyspark.pandas.groupby.GroupBy.apply">[docs]</a> <span class="k">def</span> <span class="nf">apply</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Apply function `func` group-wise and combine the results together.</span>
<span class="sd"> The function passed to `apply` must take a DataFrame as its first</span>
<span class="sd"> argument and return a DataFrame. `apply` will</span>
<span class="sd"> then take care of combining the results back together into a single</span>
<span class="sd"> dataframe. `apply` is therefore a highly flexible</span>
<span class="sd"> grouping method.</span>
<span class="sd"> While `apply` is a very flexible method, its downside is that</span>
<span class="sd"> using it can be quite a bit slower than using more specific methods</span>
<span class="sd"> like `agg` or `transform`. pandas-on-Spark offers a wide range of method that will</span>
<span class="sd"> be much faster than using `apply` for their specific purposes, so try to</span>
<span class="sd"> use them before reaching for `apply`.</span>
<span class="sd"> .. note:: this API executes the function once to infer the type which is</span>
<span class="sd"> potentially expensive, for instance, when the dataset is created after</span>
<span class="sd"> aggregations or sorting.</span>
<span class="sd"> To avoid this, specify return type in ``func``, for instance, as below:</span>
<span class="sd"> &gt;&gt;&gt; def pandas_div(x) -&gt; ps.DataFrame[int, [float, float]]:</span>
<span class="sd"> ... return x[[&#39;B&#39;, &#39;C&#39;]] / x[[&#39;B&#39;, &#39;C&#39;]]</span>
<span class="sd"> If the return type is specified, the output column names become</span>
<span class="sd"> `c0, c1, c2 ... cn`. These names are positionally mapped to the returned</span>
<span class="sd"> DataFrame in ``func``.</span>
<span class="sd"> To specify the column names, you can assign them in a NumPy compound type style</span>
<span class="sd"> as below:</span>
<span class="sd"> &gt;&gt;&gt; def pandas_div(x) -&gt; ps.DataFrame[(&quot;index&quot;, int), [(&quot;a&quot;, float), (&quot;b&quot;, float)]]:</span>
<span class="sd"> ... return x[[&#39;B&#39;, &#39;C&#39;]] / x[[&#39;B&#39;, &#39;C&#39;]]</span>
<span class="sd"> &gt;&gt;&gt; pdf = pd.DataFrame({&#39;B&#39;: [1.], &#39;C&#39;: [3.]})</span>
<span class="sd"> &gt;&gt;&gt; def plus_one(x) -&gt; ps.DataFrame[</span>
<span class="sd"> ... (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:</span>
<span class="sd"> ... return x[[&#39;B&#39;, &#39;C&#39;]] / x[[&#39;B&#39;, &#39;C&#39;]]</span>
<span class="sd"> .. note:: the dataframe within ``func`` is actually a pandas dataframe. Therefore,</span>
<span class="sd"> any pandas API within this function is allowed.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> func : callable</span>
<span class="sd"> A callable that takes a DataFrame as its first argument, and</span>
<span class="sd"> returns a dataframe.</span>
<span class="sd"> *args</span>
<span class="sd"> Positional arguments to pass to func.</span>
<span class="sd"> **kwargs</span>
<span class="sd"> Keyword arguments to pass to func.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> applied : DataFrame or Series</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> aggregate : Apply aggregate function to the GroupBy object.</span>
<span class="sd"> DataFrame.apply : Apply a function to a DataFrame.</span>
<span class="sd"> Series.apply : Apply a function to a Series.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: &#39;a a b&#39;.split(),</span>
<span class="sd"> ... &#39;B&#39;: [1, 2, 3],</span>
<span class="sd"> ... &#39;C&#39;: [4, 6, 5]}, columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;])</span>
<span class="sd"> &gt;&gt;&gt; g = df.groupby(&#39;A&#39;)</span>
<span class="sd"> Notice that ``g`` has two groups, ``a`` and ``b``.</span>
<span class="sd"> Calling `apply` in various ways, we can get different grouping results:</span>
<span class="sd"> Below the functions passed to `apply` takes a DataFrame as</span>
<span class="sd"> its argument and returns a DataFrame. `apply` combines the result for</span>
<span class="sd"> each group together into a new DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; def plus_min(x):</span>
<span class="sd"> ... return x + x.min()</span>
<span class="sd"> &gt;&gt;&gt; g.apply(plus_min).sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> A B C</span>
<span class="sd"> 0 aa 2 8</span>
<span class="sd"> 1 aa 3 10</span>
<span class="sd"> 2 bb 6 10</span>
<span class="sd"> &gt;&gt;&gt; g.apply(sum).sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> A B C</span>
<span class="sd"> A</span>
<span class="sd"> a aa 3 10</span>
<span class="sd"> b b 3 5</span>
<span class="sd"> &gt;&gt;&gt; g.apply(len).sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> A</span>
<span class="sd"> a 2</span>
<span class="sd"> b 1</span>
<span class="sd"> dtype: int64</span>
<span class="sd"> You can specify the type hint and prevent schema inference for better performance.</span>
<span class="sd"> &gt;&gt;&gt; def pandas_div(x) -&gt; ps.DataFrame[int, [float, float]]:</span>
<span class="sd"> ... return x[[&#39;B&#39;, &#39;C&#39;]] / x[[&#39;B&#39;, &#39;C&#39;]]</span>
<span class="sd"> &gt;&gt;&gt; g.apply(pandas_div).sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> c0 c1</span>
<span class="sd"> 0 1.0 1.0</span>
<span class="sd"> 1 1.0 1.0</span>
<span class="sd"> 2 1.0 1.0</span>
<span class="sd"> &gt;&gt;&gt; def pandas_div(x) -&gt; ps.DataFrame[(&quot;index&quot;, int), [(&quot;f1&quot;, float), (&quot;f2&quot;, float)]]:</span>
<span class="sd"> ... return x[[&#39;B&#39;, &#39;C&#39;]] / x[[&#39;B&#39;, &#39;C&#39;]]</span>
<span class="sd"> &gt;&gt;&gt; g.apply(pandas_div).sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> f1 f2</span>
<span class="sd"> index</span>
<span class="sd"> 0 1.0 1.0</span>
<span class="sd"> 1 1.0 1.0</span>
<span class="sd"> 2 1.0 1.0</span>
<span class="sd"> In case of Series, it works as below.</span>
<span class="sd"> &gt;&gt;&gt; def plus_max(x) -&gt; ps.Series[np.int]:</span>
<span class="sd"> ... return x + x.max()</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).apply(plus_max).sort_index() # doctest: +SKIP</span>
<span class="sd"> 0 6</span>
<span class="sd"> 1 3</span>
<span class="sd"> 2 4</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; def plus_min(x):</span>
<span class="sd"> ... return x + x.min()</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).apply(plus_min).sort_index()</span>
<span class="sd"> 0 2</span>
<span class="sd"> 1 3</span>
<span class="sd"> 2 6</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> You can also return a scalar value as a aggregated value of the group:</span>
<span class="sd"> &gt;&gt;&gt; def plus_length(x) -&gt; np.int:</span>
<span class="sd"> ... return len(x)</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).apply(plus_length).sort_index() # doctest: +SKIP</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 2</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> The extra arguments to the function can be passed as below.</span>
<span class="sd"> &gt;&gt;&gt; def calculation(x, y, z) -&gt; np.int:</span>
<span class="sd"> ... return len(x) + y * z</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).apply(calculation, 5, z=10).sort_index() # doctest: +SKIP</span>
<span class="sd"> 0 51</span>
<span class="sd"> 1 52</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">callable</span><span class="p">(</span><span class="n">func</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">%s</span><span class="s2"> object is not callable&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="n">spec</span> <span class="o">=</span> <span class="n">inspect</span><span class="o">.</span><span class="n">getfullargspec</span><span class="p">(</span><span class="n">func</span><span class="p">)</span>
<span class="n">return_sig</span> <span class="o">=</span> <span class="n">spec</span><span class="o">.</span><span class="n">annotations</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;return&quot;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="n">should_infer_schema</span> <span class="o">=</span> <span class="n">return_sig</span> <span class="ow">is</span> <span class="kc">None</span>
<span class="n">should_retain_index</span> <span class="o">=</span> <span class="n">should_infer_schema</span>
<span class="n">is_series_groupby</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">SeriesGroupBy</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span>
<span class="p">]</span>
<span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">agg_columns</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span>
<span class="n">pandas_apply</span> <span class="o">=</span> <span class="n">_builtin_table</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">func</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">f</span> <span class="o">=</span> <span class="n">_builtin_table</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">func</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">pandas_apply</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="o">*</span><span class="n">a</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">k</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span>
<span class="k">return</span> <span class="n">f</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">),</span> <span class="o">*</span><span class="n">a</span><span class="p">,</span> <span class="o">**</span><span class="n">k</span><span class="p">)</span>
<span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">False</span>
<span class="k">if</span> <span class="n">should_infer_schema</span><span class="p">:</span>
<span class="c1"># Here we execute with the first 1000 to get the return type.</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;If the type hints is not specified for `grouby.apply`, &quot;</span>
<span class="s2">&quot;it is expensive to infer the data type internally.&quot;</span>
<span class="p">)</span>
<span class="n">limit</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">&quot;compute.shortcut_limit&quot;</span><span class="p">)</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">limit</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span>
<span class="n">groupkeys</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">pdf</span><span class="p">[</span><span class="n">groupkey_name</span><span class="p">]</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">groupkey_name</span><span class="p">,</span> <span class="n">psser</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span>
<span class="p">]</span>
<span class="n">grouped</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">)</span>
<span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="n">pser_or_pdf</span> <span class="o">=</span> <span class="n">grouped</span><span class="p">[</span><span class="n">name</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">pser_or_pdf</span> <span class="o">=</span> <span class="n">grouped</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="n">psser_or_psdf</span> <span class="o">=</span> <span class="n">ps</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">pser_or_pdf</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">limit</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser_or_psdf</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="n">psser_or_psdf</span> <span class="o">=</span> <span class="n">psser_or_psdf</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">SeriesGroupBy</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">],</span> <span class="n">psser_or_psdf</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">grouped</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">with</span> <span class="n">warnings</span><span class="o">.</span><span class="n">catch_warnings</span><span class="p">():</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">simplefilter</span><span class="p">(</span><span class="s2">&quot;always&quot;</span><span class="p">)</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;The amount of data for return type inference might not be large enough. &quot;</span>
<span class="s2">&quot;Consider increasing an option `compute.shortcut_limit`.&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser_or_psdf</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">psdf_from_pandas</span> <span class="o">=</span> <span class="n">psser_or_psdf</span><span class="o">.</span><span class="n">_psdf</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">psdf_from_pandas</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">psser_or_psdf</span><span class="p">)</span>
<span class="n">index_fields</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">field</span><span class="o">.</span><span class="n">normalize_spark_type</span><span class="p">()</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span>
<span class="p">]</span>
<span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">field</span><span class="o">.</span><span class="n">normalize_spark_type</span><span class="p">()</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span>
<span class="p">]</span>
<span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">index_fields</span> <span class="o">+</span> <span class="n">data_fields</span><span class="p">])</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">return_type</span> <span class="o">=</span> <span class="n">infer_return_type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_series_groupby</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">return_type</span><span class="p">,</span> <span class="n">SeriesType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Series as a return type hint at frame groupby is not supported &quot;</span>
<span class="s2">&quot;currently; however got [</span><span class="si">%s</span><span class="s2">]. Use DataFrame type hint instead.&quot;</span> <span class="o">%</span> <span class="n">return_sig</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">return_type</span><span class="p">,</span> <span class="n">DataFrameType</span><span class="p">):</span>
<span class="n">data_fields</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">data_fields</span>
<span class="n">return_schema</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">spark_type</span>
<span class="n">index_fields</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">index_fields</span>
<span class="n">should_retain_index</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">index_fields</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span>
<span class="n">psdf_from_pandas</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">dtype</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">SeriesType</span><span class="p">,</span> <span class="n">ScalarType</span><span class="p">],</span> <span class="n">return_type</span><span class="p">)</span><span class="o">.</span><span class="n">dtype</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">SeriesType</span><span class="p">,</span> <span class="n">ScalarType</span><span class="p">],</span> <span class="n">return_type</span><span class="p">)</span><span class="o">.</span><span class="n">spark_type</span>
<span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">InternalField</span><span class="p">(</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">struct_field</span><span class="o">=</span><span class="n">StructField</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span> <span class="n">dataType</span><span class="o">=</span><span class="n">spark_type</span><span class="p">)</span>
<span class="p">)</span>
<span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">InternalField</span><span class="p">(</span>
<span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span>
<span class="n">struct_field</span><span class="o">=</span><span class="n">StructField</span><span class="p">(</span>
<span class="n">name</span><span class="o">=</span><span class="n">SPARK_DEFAULT_SERIES_NAME</span><span class="p">,</span> <span class="n">dataType</span><span class="o">=</span><span class="n">spark_type</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="p">]</span>
<span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">data_fields</span><span class="p">])</span>
<span class="k">def</span> <span class="nf">pandas_groupby_apply</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="n">pdf_or_ser</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)[</span><span class="n">name</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">pdf_or_ser</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">if</span> <span class="n">should_return_series</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_ser</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="n">pdf_or_ser</span> <span class="o">=</span> <span class="n">pdf_or_ser</span><span class="o">.</span><span class="n">stack</span><span class="p">()</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_ser</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf_or_ser</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pdf_or_ser</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">pandas_groupby_apply</span><span class="p">,</span>
<span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span>
<span class="n">return_schema</span><span class="p">,</span>
<span class="n">retain_index</span><span class="o">=</span><span class="n">should_retain_index</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">should_retain_index</span><span class="p">:</span>
<span class="c1"># If schema is inferred, we can restore indexes too.</span>
<span class="k">if</span> <span class="n">psdf_from_pandas</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">index_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">index_spark_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_field</span><span class="o">.</span><span class="n">struct_field</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">index_field</span> <span class="ow">in</span> <span class="n">index_fields</span>
<span class="p">]</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">any</span><span class="p">(</span>
<span class="p">[</span>
<span class="n">SPARK_INDEX_NAME_PATTERN</span><span class="o">.</span><span class="n">match</span><span class="p">(</span><span class="n">index_field</span><span class="o">.</span><span class="n">struct_field</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">index_field</span> <span class="ow">in</span> <span class="n">index_fields</span>
<span class="p">]</span>
<span class="p">):</span>
<span class="n">index_names</span> <span class="o">=</span> <span class="p">[(</span><span class="n">index_field</span><span class="o">.</span><span class="n">struct_field</span><span class="o">.</span><span class="n">name</span><span class="p">,)</span> <span class="k">for</span> <span class="n">index_field</span> <span class="ow">in</span> <span class="n">index_fields</span><span class="p">]</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span>
<span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span>
<span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># Otherwise, the index is lost.</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">should_return_series</span><span class="p">:</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span>
<span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">SeriesGroupBy</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">return</span> <span class="n">psser</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div>
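# Editor's note -- hedged illustration, not part of the pyspark source above. The branches
# above pick the UDF's return schema: without a return type hint it is inferred by running
# the function on a small sampled pandas DataFrame (so the index can be retained), while a
# DataFrame/Series type hint supplies the schema directly (and the index is lost unless the
# hint also declares index fields). A minimal usage sketch, assuming a running Spark session:
#
#     import pandas as pd
#     import pyspark.pandas as ps
#
#     psdf = ps.DataFrame({"A": [1, 1, 2], "B": [1.0, 2.0, 3.0]})
#
#     # Schema inferred from a sampled pandas DataFrame; the index can be restored.
#     inferred = psdf.groupby("A").apply(lambda pdf: pdf.assign(B2=pdf.B * 2))
#
#     # Schema taken from the type hint (data dtypes only); the original index is not kept.
#     def doubled(pdf: pd.DataFrame) -> ps.DataFrame[float, float]:
#         return pdf[["B"]].assign(B2=pdf.B * 2)
#
#     hinted = psdf.groupby("A").apply(doubled)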
<span class="c1"># TODO: implement &#39;dropna&#39; parameter</span>
<div class="viewcode-block" id="GroupBy.filter"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.filter.html#pyspark.pandas.groupby.GroupBy.filter">[docs]</a> <span class="k">def</span> <span class="nf">filter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">FrameLike</span><span class="p">],</span> <span class="n">FrameLike</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return a copy of a DataFrame excluding elements from groups that</span>
<span class="sd"> do not satisfy the boolean criterion specified by func.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> func : function</span>
<span class="sd"> Function to apply to each subframe. Should return True or False.</span>
<span class="sd"> dropna : Drop groups that do not pass the filter. True by default;</span>
<span class="sd"> if False, groups that evaluate False are filled with NaNs.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> filtered : DataFrame or Series</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Each subframe is endowed with the attribute &#39;name&#39; in case you need to know</span>
<span class="sd"> which group you are working on.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39; : [&#39;foo&#39;, &#39;bar&#39;, &#39;foo&#39;, &#39;bar&#39;,</span>
<span class="sd"> ... &#39;foo&#39;, &#39;bar&#39;],</span>
<span class="sd"> ... &#39;B&#39; : [1, 2, 3, 4, 5, 6],</span>
<span class="sd"> ... &#39;C&#39; : [2.0, 5., 8., 1., 2., 9.]}, columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;])</span>
<span class="sd"> &gt;&gt;&gt; grouped = df.groupby(&#39;A&#39;)</span>
<span class="sd"> &gt;&gt;&gt; grouped.filter(lambda x: x[&#39;B&#39;].mean() &gt; 3.)</span>
<span class="sd"> A B C</span>
<span class="sd"> 1 bar 2 5.0</span>
<span class="sd"> 3 bar 4 1.0</span>
<span class="sd"> 5 bar 6 9.0</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).filter(lambda x: x.mean() &gt; 3.)</span>
<span class="sd"> 1 2</span>
<span class="sd"> 3 4</span>
<span class="sd"> 5 6</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">callable</span><span class="p">(</span><span class="n">func</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">%s</span><span class="s2"> object is not callable&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="n">is_series_groupby</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">SeriesGroupBy</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span>
<span class="p">]</span>
<span class="n">data_schema</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">psdf</span><span class="p">[</span><span class="n">agg_columns</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span><span class="o">.</span><span class="n">schema</span>
<span class="p">)</span>
<span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">agg_columns</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">pandas_filter</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)[</span><span class="n">pdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]]</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">func</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">f</span> <span class="o">=</span> <span class="n">_builtin_table</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">func</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">wrapped_func</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="k">return</span> <span class="n">f</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">))</span>
<span class="k">def</span> <span class="nf">pandas_filter</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">wrapped_func</span><span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">pandas_filter</span><span class="p">,</span>
<span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span>
<span class="n">data_schema</span><span class="p">,</span>
<span class="n">retain_index</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">[</span><span class="n">agg_columns</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">))</span>
<span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">FrameLike</span><span class="p">,</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">FrameLike</span><span class="p">,</span> <span class="n">psdf</span><span class="p">)</span></div>
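# Editor's note -- hedged usage sketch, not part of the pyspark source. `filter` keeps only
# the rows of groups for which `func` returns True, mirroring pandas
# DataFrameGroupBy.filter; the group keys are hidden behind temporary "__groupkey_N__"
# columns while the pandas filter runs inside the UDF. Assuming a running Spark session:
#
#     import pyspark.pandas as ps
#
#     df = ps.DataFrame({"A": ["foo", "bar", "foo", "bar"], "B": [1, 2, 3, 4]})
#     kept = df.groupby("A").filter(lambda g: g["B"].mean() > 2)
#     # Only groups whose mean of B exceeds 2 survive; unlike pandas, row order is not
#     # guaranteed, so sort_index() is typically applied before comparing results.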
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_prepare_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">groupkeys</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> <span class="n">agg_columns</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]:</span>
<span class="n">groupkey_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="s2">&quot;__groupkey_</span><span class="si">{}</span><span class="s2">__&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">))</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))</span>
<span class="p">]</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[[</span><span class="n">s</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">)]</span> <span class="o">+</span> <span class="n">agg_columns</span><span class="p">]</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span> <span class="k">else</span> <span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">]</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="p">),</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span>
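# Editor's note -- hedged illustration of the helper above, not part of the pyspark source.
# Each group key is renamed to a collision-free temporary column ("__groupkey_0__",
# "__groupkey_1__", ...) and placed in front of the aggregation columns, so the pandas
# groupby inside the UDF can reference plain string names. For two group keys the returned
# values look roughly like:
#
#     groupkey_labels = [("__groupkey_0__",), ("__groupkey_1__",)]
#     groupkey_names  = ["__groupkey_0__", "__groupkey_1__"]
#     # and the returned DataFrame is a resolved_copy holding
#     # [renamed group keys] + agg_columns.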
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_spark_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span>
<span class="n">groupkeys_scols</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span>
<span class="n">return_schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">,</span>
<span class="n">retain_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">SparkDataFrame</span><span class="p">:</span>
<span class="n">output_func</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_make_pandas_df_builder_func</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">func</span><span class="p">,</span> <span class="n">return_schema</span><span class="p">,</span> <span class="n">retain_index</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span>
<span class="k">return</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkeys_scols</span><span class="p">)</span><span class="o">.</span><span class="n">applyInPandas</span><span class="p">(</span><span class="n">output_func</span><span class="p">,</span> <span class="n">return_schema</span><span class="p">)</span>
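# Editor's note -- hedged sketch, not part of the pyspark source. The helper above is a thin
# wrapper over pyspark.sql's GroupedData.applyInPandas: it drops the hidden internal
# columns, groups by the key Columns, and applies the wrapped pandas function with the
# declared schema. A bare-bones equivalent, assuming an existing SparkSession named `spark`:
#
#     from pyspark.sql.types import StructType, StructField, LongType
#
#     sdf = spark.createDataFrame([(1, 2), (1, 3), (2, 4)], ["k", "v"])
#     schema = StructType([StructField("k", LongType()), StructField("v", LongType())])
#
#     def total_per_group(pdf):  # pandas.DataFrame in, pandas.DataFrame out
#         return pdf.assign(v=pdf["v"].sum())
#
#     out = sdf.groupby("k").applyInPandas(total_per_group, schema)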
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_make_pandas_df_builder_func</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span>
<span class="n">return_schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">,</span>
<span class="n">retain_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">]:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Creates a function that can be used inside the pandas UDF. This function constructs</span>
<span class="sd"> the same pandas DataFrame as if the pandas-on-Spark DataFrame were collected to the driver side.</span>
<span class="sd"> The index, column labels, etc. are re-constructed within the function.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.utils</span> <span class="kn">import</span> <span class="n">is_timestamp_ntz_preferred</span>
<span class="n">arguments_for_restore_index</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">arguments_for_restore_index</span>
<span class="n">prefer_timestamp_ntz</span> <span class="o">=</span> <span class="n">is_timestamp_ntz_preferred</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">rename_output</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">restore_index</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">copy</span><span class="p">(),</span> <span class="o">**</span><span class="n">arguments_for_restore_index</span><span class="p">)</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">func</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span>
<span class="c1"># If the schema should be inferred, we don&#39;t restore the index here; pandas seems</span>
<span class="c1"># to restore the index itself in some cases.</span>
<span class="c1"># When the Spark output type is specified explicitly, we cannot know, without executing</span>
<span class="c1"># the function, whether the index should be restored. For instance, see the example in</span>
<span class="c1"># https://github.com/pyspark.pandas/issues/628.</span>
<span class="n">pdf</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">prepare_pandas_frame</span><span class="p">(</span>
<span class="n">pdf</span><span class="p">,</span> <span class="n">retain_index</span><span class="o">=</span><span class="n">retain_index</span><span class="p">,</span> <span class="n">prefer_timestamp_ntz</span><span class="o">=</span><span class="n">prefer_timestamp_ntz</span>
<span class="p">)</span>
<span class="c1"># Just positionally map the column names to the given schema&#39;s names.</span>
<span class="n">pdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">return_schema</span><span class="o">.</span><span class="n">names</span>
<span class="k">return</span> <span class="n">pdf</span>
<span class="k">return</span> <span class="n">rename_output</span>
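# Editor's note -- hedged summary of rename_output above, not part of the pyspark source.
# Per pandas chunk handed to the UDF, the wrapper:
#   1. rebuilds the original pandas index and column labels with InternalFrame.restore_index,
#   2. calls the user function,
#   3. flattens the result back into Spark-friendly columns with
#      InternalFrame.prepare_pandas_frame (keeping the index only when retain_index=True),
#   4. renames the columns positionally to return_schema.names, since only positions -- not
#      names -- are guaranteed to line up with the declared schema.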
<div class="viewcode-block" id="GroupBy.rank"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.rank.html#pyspark.pandas.groupby.GroupBy.rank">[docs]</a> <span class="k">def</span> <span class="nf">rank</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;average&quot;</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Provide the rank of values within each group.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> method : {&#39;average&#39;, &#39;min&#39;, &#39;max&#39;, &#39;first&#39;, &#39;dense&#39;}, default &#39;average&#39;</span>
<span class="sd"> * average: average rank of group</span>
<span class="sd"> * min: lowest rank in group</span>
<span class="sd"> * max: highest rank in group</span>
<span class="sd"> * first: ranks assigned in order they appear in the array</span>
<span class="sd"> * dense: like &#39;min&#39;, but rank always increases by 1 between groups</span>
<span class="sd"> ascending : boolean, default True</span>
<span class="sd"> False for ranks by high (1) to low (N)</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame with ranking of values within each group</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({</span>
<span class="sd"> ... &#39;a&#39;: [1, 1, 1, 2, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;b&#39;: [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b</span>
<span class="sd"> 0 1 1</span>
<span class="sd"> 1 1 2</span>
<span class="sd"> 2 1 2</span>
<span class="sd"> 3 2 2</span>
<span class="sd"> 4 2 3</span>
<span class="sd"> 5 2 3</span>
<span class="sd"> 6 3 3</span>
<span class="sd"> 7 3 4</span>
<span class="sd"> 8 3 4</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&quot;a&quot;).rank().sort_index()</span>
<span class="sd"> b</span>
<span class="sd"> 0 1.0</span>
<span class="sd"> 1 2.5</span>
<span class="sd"> 2 2.5</span>
<span class="sd"> 3 1.0</span>
<span class="sd"> 4 2.5</span>
<span class="sd"> 5 2.5</span>
<span class="sd"> 6 1.0</span>
<span class="sd"> 7 2.5</span>
<span class="sd"> 8 2.5</span>
<span class="sd"> &gt;&gt;&gt; df.b.groupby(df.a).rank(method=&#39;max&#39;).sort_index()</span>
<span class="sd"> 0 1.0</span>
<span class="sd"> 1 3.0</span>
<span class="sd"> 2 3.0</span>
<span class="sd"> 3 1.0</span>
<span class="sd"> 4 3.0</span>
<span class="sd"> 5 3.0</span>
<span class="sd"> 6 1.0</span>
<span class="sd"> 7 3.0</span>
<span class="sd"> 8 3.0</span>
<span class="sd"> Name: b, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_rank</span><span class="p">(</span><span class="n">method</span><span class="p">,</span> <span class="n">ascending</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span>
<span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
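# Editor's note -- hedged sketch, not part of the pyspark source. Each column's rank is
# computed as a Spark window function partitioned by the group-key columns (via
# Series._rank with part_cols). For method="min" the rough pyspark.sql equivalent is:
#
#     from pyspark.sql import Window, functions as F
#
#     w = Window.partitionBy("a").orderBy("b")
#     ranked = sdf.withColumn("b_rank", F.rank().over(w))
#     # method="dense" roughly corresponds to F.dense_rank(), method="first" to
#     # F.row_number(); "average" has no direct built-in counterpart.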
<span class="c1"># TODO: add axis parameter</span>
<div class="viewcode-block" id="GroupBy.idxmax"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.idxmax.html#pyspark.pandas.groupby.GroupBy.idxmax">[docs]</a> <span class="k">def</span> <span class="nf">idxmax</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return index of first occurrence of maximum over requested axis in group.</span>
<span class="sd"> NA/null values are excluded.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> skipna : boolean, default True</span>
<span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.idxmax</span>
<span class="sd"> DataFrame.idxmax</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 2, 2, 3],</span>
<span class="sd"> ... &#39;b&#39;: [1, 2, 3, 4, 5],</span>
<span class="sd"> ... &#39;c&#39;: [5, 4, 3, 2, 1]}, columns=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;a&#39;])[&#39;b&#39;].idxmax().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> a</span>
<span class="sd"> 1 1</span>
<span class="sd"> 2 3</span>
<span class="sd"> 3 4</span>
<span class="sd"> Name: b, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;a&#39;]).idxmax().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> b c</span>
<span class="sd"> a</span>
<span class="sd"> 1 1 0</span>
<span class="sd"> 2 3 2</span>
<span class="sd"> 3 4 4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;idxmax only support one-level index now&quot;</span><span class="p">)</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;__groupkey_</span><span class="si">{}</span><span class="s2">__&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">):</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span>
<span class="n">index</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">stat_exprs</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_scols</span><span class="p">):</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">if</span> <span class="n">skipna</span><span class="p">:</span>
<span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">desc_nulls_last</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">desc_nulls_first</span><span class="p">()</span>
<span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span>
<span class="n">order_column</span><span class="p">,</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span>
<span class="n">name</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">,</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">stat_exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">))</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">stat_exprs</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">],</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="p">],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cleanup_and_return</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
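# Editor's note -- hedged sketch, not part of the pyspark source. idxmax relies on a
# row_number window per group: the row ranked first by the descending value keeps its index
# value, every other row gets NULL, and a plain max() per group then picks the one surviving
# index. Roughly, for group key "a", value column "b", and a default index column named
# "__index_level_0__" (an assumption for illustration):
#
#     from pyspark.sql import Window, functions as F
#
#     w = Window.partitionBy("a").orderBy(F.col("b").desc_nulls_last())
#     picked = sdf.withColumn(
#         "b_idx", F.when(F.row_number().over(w) == 1, F.col("__index_level_0__"))
#     )
#     result = picked.groupby("a").agg(F.max("b_idx").alias("b"))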
<span class="c1"># TODO: add axis parameter</span>
<div class="viewcode-block" id="GroupBy.idxmin"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.idxmin.html#pyspark.pandas.groupby.GroupBy.idxmin">[docs]</a> <span class="k">def</span> <span class="nf">idxmin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return index of first occurrence of minimum over requested axis in group.</span>
<span class="sd"> NA/null values are excluded.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> skipna : boolean, default True</span>
<span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.idxmin</span>
<span class="sd"> DataFrame.idxmin</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 2, 2, 3],</span>
<span class="sd"> ... &#39;b&#39;: [1, 2, 3, 4, 5],</span>
<span class="sd"> ... &#39;c&#39;: [5, 4, 3, 2, 1]}, columns=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;a&#39;])[&#39;b&#39;].idxmin().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> a</span>
<span class="sd"> 1 0</span>
<span class="sd"> 2 2</span>
<span class="sd"> 3 4</span>
<span class="sd"> Name: b, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;a&#39;]).idxmin().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> b c</span>
<span class="sd"> a</span>
<span class="sd"> 1 0 1</span>
<span class="sd"> 2 2 3</span>
<span class="sd"> 3 4 4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;idxmin only support one-level index now&quot;</span><span class="p">)</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;__groupkey_</span><span class="si">{}</span><span class="s2">__&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">):</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span>
<span class="n">index</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">stat_exprs</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_scols</span><span class="p">):</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="k">if</span> <span class="n">skipna</span><span class="p">:</span>
<span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">asc_nulls_last</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">asc_nulls_first</span><span class="p">()</span>
<span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span>
<span class="n">order_column</span><span class="p">,</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span>
<span class="n">name</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">,</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">stat_exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">))</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">stat_exprs</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">],</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="p">],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cleanup_and_return</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<div class="viewcode-block" id="GroupBy.fillna"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.fillna.html#pyspark.pandas.groupby.GroupBy.fillna">[docs]</a> <span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">method</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;Fill NA/NaN values in group.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> value : scalar, dict, Series</span>
<span class="sd"> Value to use to fill holes. alternately a dict/Series of values</span>
<span class="sd"> specifying which value to use for each column.</span>
<span class="sd"> DataFrame is not supported.</span>
<span class="sd"> method : {&#39;backfill&#39;, &#39;bfill&#39;, &#39;pad&#39;, &#39;ffill&#39;, None}, default None</span>
<span class="sd"> Method to use for filling holes in reindexed Series pad / ffill: propagate last valid</span>
<span class="sd"> observation forward to next valid backfill / bfill:</span>
<span class="sd"> use NEXT valid observation to fill gap</span>
<span class="sd"> axis : {0 or `index`}</span>
<span class="sd"> 1 and `columns` are not supported.</span>
<span class="sd"> inplace : boolean, default False</span>
<span class="sd"> Fill in place (do not create a new object)</span>
<span class="sd"> limit : int, default None</span>
<span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span>
<span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span>
<span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span>
<span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span>
<span class="sd"> Must be greater than 0 if not None</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> DataFrame with NA entries filled.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({</span>
<span class="sd"> ... &#39;A&#39;: [1, 1, 2, 2],</span>
<span class="sd"> ... &#39;B&#39;: [2, 4, None, 3],</span>
<span class="sd"> ... &#39;C&#39;: [None, None, None, 1],</span>
<span class="sd"> ... &#39;D&#39;: [0, 1, 5, 4]</span>
<span class="sd"> ... },</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;, &#39;D&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 1 2.0 NaN 0</span>
<span class="sd"> 1 1 4.0 NaN 1</span>
<span class="sd"> 2 2 NaN NaN 5</span>
<span class="sd"> 3 2 3.0 1.0 4</span>
<span class="sd"> We can also propagate non-null values forward or backward in group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;A&#39;])[&#39;B&#39;].fillna(method=&#39;ffill&#39;).sort_index()</span>
<span class="sd"> 0 2.0</span>
<span class="sd"> 1 4.0</span>
<span class="sd"> 2 NaN</span>
<span class="sd"> 3 3.0</span>
<span class="sd"> Name: B, dtype: float64</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;A&#39;]).fillna(method=&#39;bfill&#39;).sort_index()</span>
<span class="sd"> B C D</span>
<span class="sd"> 0 2.0 NaN 0</span>
<span class="sd"> 1 4.0 NaN 1</span>
<span class="sd"> 2 3.0 1.0 5</span>
<span class="sd"> 3 3.0 1.0 4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_fillna</span><span class="p">(</span>
<span class="n">value</span><span class="o">=</span><span class="n">value</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="n">method</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span>
<span class="p">),</span>
<span class="n">should_resolve</span><span class="o">=</span><span class="p">(</span><span class="n">method</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">),</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.bfill"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.bfill.html#pyspark.pandas.groupby.GroupBy.bfill">[docs]</a> <span class="k">def</span> <span class="nf">bfill</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Synonym for `DataFrame.fillna()` with ``method=`bfill```.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {0 or `index`}</span>
<span class="sd"> 1 and `columns` are not supported.</span>
<span class="sd"> inplace : boolean, default False</span>
<span class="sd"> Fill in place (do not create a new object)</span>
<span class="sd"> limit : int, default None</span>
<span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span>
<span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span>
<span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span>
<span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span>
<span class="sd"> Must be greater than 0 if not None</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> DataFrame with NA entries filled.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({</span>
<span class="sd"> ... &#39;A&#39;: [1, 1, 2, 2],</span>
<span class="sd"> ... &#39;B&#39;: [2, 4, None, 3],</span>
<span class="sd"> ... &#39;C&#39;: [None, None, None, 1],</span>
<span class="sd"> ... &#39;D&#39;: [0, 1, 5, 4]</span>
<span class="sd"> ... },</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;, &#39;D&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 1 2.0 NaN 0</span>
<span class="sd"> 1 1 4.0 NaN 1</span>
<span class="sd"> 2 2 NaN NaN 5</span>
<span class="sd"> 3 2 3.0 1.0 4</span>
<span class="sd"> Propagate non-null values backward.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;A&#39;]).bfill().sort_index()</span>
<span class="sd"> B C D</span>
<span class="sd"> 0 2.0 NaN 0</span>
<span class="sd"> 1 4.0 NaN 1</span>
<span class="sd"> 2 3.0 1.0 5</span>
<span class="sd"> 3 3.0 1.0 4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">&quot;bfill&quot;</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span></div>
<span class="n">backfill</span> <span class="o">=</span> <span class="n">bfill</span>
<div class="viewcode-block" id="GroupBy.ffill"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.ffill.html#pyspark.pandas.groupby.GroupBy.ffill">[docs]</a> <span class="k">def</span> <span class="nf">ffill</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Synonym for `DataFrame.fillna()` with ``method=`ffill```.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> axis : {0 or `index`}</span>
<span class="sd"> 1 and `columns` are not supported.</span>
<span class="sd"> inplace : boolean, default False</span>
<span class="sd"> Fill in place (do not create a new object)</span>
<span class="sd"> limit : int, default None</span>
<span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span>
<span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span>
<span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span>
<span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span>
<span class="sd"> Must be greater than 0 if not None</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> DataFrame with NA entries filled.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({</span>
<span class="sd"> ... &#39;A&#39;: [1, 1, 2, 2],</span>
<span class="sd"> ... &#39;B&#39;: [2, 4, None, 3],</span>
<span class="sd"> ... &#39;C&#39;: [None, None, None, 1],</span>
<span class="sd"> ... &#39;D&#39;: [0, 1, 5, 4]</span>
<span class="sd"> ... },</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;, &#39;D&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B C D</span>
<span class="sd"> 0 1 2.0 NaN 0</span>
<span class="sd"> 1 1 4.0 NaN 1</span>
<span class="sd"> 2 2 NaN NaN 5</span>
<span class="sd"> 3 2 3.0 1.0 4</span>
<span class="sd"> Propagate non-null values forward.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;A&#39;]).ffill().sort_index()</span>
<span class="sd"> B C D</span>
<span class="sd"> 0 2.0 NaN 0</span>
<span class="sd"> 1 4.0 NaN 1</span>
<span class="sd"> 2 NaN NaN 5</span>
<span class="sd"> 3 3.0 1.0 4</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">&quot;ffill&quot;</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span></div>
<span class="n">pad</span> <span class="o">=</span> <span class="n">ffill</span>
<span class="k">def</span> <span class="nf">_limit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">asc</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Private function for tail and head.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span>
<span class="p">]</span>
<span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span>
<span class="n">agg_columns</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="n">tmp_col</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;__row_number__&quot;</span><span class="p">)</span>
<span class="c1"># This part is handled differently depending on whether it is a tail or a head.</span>
<span class="n">window</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_scols</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span><span class="o">.</span><span class="n">asc</span><span class="p">())</span>
<span class="k">if</span> <span class="n">asc</span>
<span class="k">else</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_scols</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span><span class="o">.</span><span class="n">desc</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">tmp_col</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span>
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">tmp_col</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">n</span><span class="p">)</span>
<span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">tmp_col</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cleanup_and_return</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">))</span>
<div class="viewcode-block" id="GroupBy.head"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.head.html#pyspark.pandas.groupby.GroupBy.head">[docs]</a> <span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return first n rows of each group.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 1, 1, 2, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;b&#39;: [2, 3, 1, 4, 6, 9, 8, 10, 7, 5],</span>
<span class="sd"> ... &#39;c&#39;: [3, 5, 2, 5, 1, 2, 6, 4, 3, 6]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;],</span>
<span class="sd"> ... index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b c</span>
<span class="sd"> 7 1 2 3</span>
<span class="sd"> 2 1 3 5</span>
<span class="sd"> 4 1 1 2</span>
<span class="sd"> 1 1 4 5</span>
<span class="sd"> 3 2 6 1</span>
<span class="sd"> 4 2 9 2</span>
<span class="sd"> 9 2 8 6</span>
<span class="sd"> 10 3 10 4</span>
<span class="sd"> 5 3 7 3</span>
<span class="sd"> 6 3 5 6</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;a&#39;).head(2).sort_index()</span>
<span class="sd"> a b c</span>
<span class="sd"> 2 1 3 5</span>
<span class="sd"> 3 2 6 1</span>
<span class="sd"> 4 2 9 2</span>
<span class="sd"> 5 3 7 3</span>
<span class="sd"> 7 1 2 3</span>
<span class="sd"> 10 3 10 4</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;a&#39;)[&#39;b&#39;].head(2).sort_index()</span>
<span class="sd"> 2 3</span>
<span class="sd"> 3 6</span>
<span class="sd"> 4 9</span>
<span class="sd"> 5 7</span>
<span class="sd"> 7 2</span>
<span class="sd"> 10 10</span>
<span class="sd"> Name: b, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_limit</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">asc</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.tail"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.tail.html#pyspark.pandas.groupby.GroupBy.tail">[docs]</a> <span class="k">def</span> <span class="nf">tail</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return last n rows of each group.</span>
<span class="sd"> Similar to `.apply(lambda x: x.tail(n))`, but it returns a subset of rows from</span>
<span class="sd"> the original DataFrame with original index and order preserved (`as_index` flag is ignored).</span>
<span class="sd"> Does not work for negative values of n.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame or Series</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 1, 1, 2, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;b&#39;: [2, 3, 1, 4, 6, 9, 8, 10, 7, 5],</span>
<span class="sd"> ... &#39;c&#39;: [3, 5, 2, 5, 1, 2, 6, 4, 3, 6]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;],</span>
<span class="sd"> ... index=[7, 2, 3, 1, 3, 4, 9, 10, 5, 6])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b c</span>
<span class="sd"> 7 1 2 3</span>
<span class="sd"> 2 1 3 5</span>
<span class="sd"> 3 1 1 2</span>
<span class="sd"> 1 1 4 5</span>
<span class="sd"> 3 2 6 1</span>
<span class="sd"> 4 2 9 2</span>
<span class="sd"> 9 2 8 6</span>
<span class="sd"> 10 3 10 4</span>
<span class="sd"> 5 3 7 3</span>
<span class="sd"> 6 3 5 6</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;a&#39;).tail(2).sort_index()</span>
<span class="sd"> a b c</span>
<span class="sd"> 1 1 4 5</span>
<span class="sd"> 3 1 1 2</span>
<span class="sd"> 4 2 9 2</span>
<span class="sd"> 5 3 7 3</span>
<span class="sd"> 6 3 5 6</span>
<span class="sd"> 9 2 8 6</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;a&#39;)[&#39;b&#39;].tail(2).sort_index()</span>
<span class="sd"> 1 4</span>
<span class="sd"> 3 1</span>
<span class="sd"> 4 9</span>
<span class="sd"> 5 7</span>
<span class="sd"> 6 5</span>
<span class="sd"> 9 8</span>
<span class="sd"> Name: b, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_limit</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">asc</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.shift"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.shift.html#pyspark.pandas.groupby.GroupBy.shift">[docs]</a> <span class="k">def</span> <span class="nf">shift</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Shift each group by periods observations.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> periods : integer, default 1</span>
<span class="sd"> number of periods to shift</span>
<span class="sd"> fill_value : optional</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> Object shifted within each group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({</span>
<span class="sd"> ... &#39;a&#39;: [1, 1, 1, 2, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;b&#39;: [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b</span>
<span class="sd"> 0 1 1</span>
<span class="sd"> 1 1 2</span>
<span class="sd"> 2 1 2</span>
<span class="sd"> 3 2 2</span>
<span class="sd"> 4 2 3</span>
<span class="sd"> 5 2 3</span>
<span class="sd"> 6 3 3</span>
<span class="sd"> 7 3 4</span>
<span class="sd"> 8 3 4</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;a&#39;).shift().sort_index() # doctest: +SKIP</span>
<span class="sd"> b</span>
<span class="sd"> 0 NaN</span>
<span class="sd"> 1 1.0</span>
<span class="sd"> 2 2.0</span>
<span class="sd"> 3 NaN</span>
<span class="sd"> 4 2.0</span>
<span class="sd"> 5 3.0</span>
<span class="sd"> 6 NaN</span>
<span class="sd"> 7 3.0</span>
<span class="sd"> 8 4.0</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;a&#39;).shift(periods=-1, fill_value=0).sort_index() # doctest: +SKIP</span>
<span class="sd"> b</span>
<span class="sd"> 0 2</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 0</span>
<span class="sd"> 3 3</span>
<span class="sd"> 4 3</span>
<span class="sd"> 5 0</span>
<span class="sd"> 6 4</span>
<span class="sd"> 7 4</span>
<span class="sd"> 8 0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_shift</span><span class="p">(</span><span class="n">periods</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span>
<span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="GroupBy.transform"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.transform.html#pyspark.pandas.groupby.GroupBy.transform">[docs]</a> <span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Apply function column-by-column to the GroupBy object.</span>
<span class="sd"> The function passed to `transform` must take a Series as its first</span>
<span class="sd"> argument and return a Series. The given function is executed for</span>
<span class="sd"> each series in each grouped data.</span>
<span class="sd"> While `transform` is a very flexible method, its downside is that</span>
<span class="sd"> using it can be quite a bit slower than using more specific methods</span>
<span class="sd"> like `agg` or `transform`. pandas-on-Spark offers a wide range of method that will</span>
<span class="sd"> be much faster than using `transform` for their specific purposes, so try to</span>
<span class="sd"> use them before reaching for `transform`.</span>
<span class="sd"> .. note:: this API executes the function once to infer the type which is</span>
<span class="sd"> potentially expensive, for instance, when the dataset is created after</span>
<span class="sd"> aggregations or sorting.</span>
<span class="sd"> To avoid this, specify return type in ``func``, for instance, as below:</span>
<span class="sd"> &gt;&gt;&gt; def convert_to_string(x) -&gt; ps.Series[str]:</span>
<span class="sd"> ... return x.apply(&quot;a string {}&quot;.format)</span>
<span class="sd"> When the given function has the return type annotated, the original index of the</span>
<span class="sd"> GroupBy object will be lost and a default index will be attached to the result.</span>
<span class="sd"> Please be careful about configuring the default index. See also `Default Index Type</span>
<span class="sd"> &lt;https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type&gt;`_.</span>
<span class="sd"> .. note:: the series within ``func`` is actually a pandas series. Therefore,</span>
<span class="sd"> any pandas API within this function is allowed.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> func : callable</span>
<span class="sd"> A callable that takes a Series as its first argument, and</span>
<span class="sd"> returns a Series.</span>
<span class="sd"> *args</span>
<span class="sd"> Positional arguments to pass to func.</span>
<span class="sd"> **kwargs</span>
<span class="sd"> Keyword arguments to pass to func.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> applied : DataFrame</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> aggregate : Apply aggregate function to the GroupBy object.</span>
<span class="sd"> Series.apply : Apply a function to a Series.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [0, 0, 1],</span>
<span class="sd"> ... &#39;B&#39;: [1, 2, 3],</span>
<span class="sd"> ... &#39;C&#39;: [4, 6, 5]}, columns=[&#39;A&#39;, &#39;B&#39;, &#39;C&#39;])</span>
<span class="sd"> &gt;&gt;&gt; g = df.groupby(&#39;A&#39;)</span>
<span class="sd"> Notice that ``g`` has two groups, ``0`` and ``1``.</span>
<span class="sd"> Calling `transform` in various ways, we can get different grouping results:</span>
<span class="sd"> Below the functions passed to `transform` takes a Series as</span>
<span class="sd"> its argument and returns a Series. `transform` applies the function on each series</span>
<span class="sd"> in each grouped data, and combine them into a new DataFrame:</span>
<span class="sd"> &gt;&gt;&gt; def convert_to_string(x) -&gt; ps.Series[str]:</span>
<span class="sd"> ... return x.apply(&quot;a string {}&quot;.format)</span>
<span class="sd"> &gt;&gt;&gt; g.transform(convert_to_string) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C</span>
<span class="sd"> 0 a string 1 a string 4</span>
<span class="sd"> 1 a string 2 a string 6</span>
<span class="sd"> 2 a string 3 a string 5</span>
<span class="sd"> &gt;&gt;&gt; def plus_max(x) -&gt; ps.Series[np.int]:</span>
<span class="sd"> ... return x + x.max()</span>
<span class="sd"> &gt;&gt;&gt; g.transform(plus_max) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C</span>
<span class="sd"> 0 3 10</span>
<span class="sd"> 1 4 12</span>
<span class="sd"> 2 6 10</span>
<span class="sd"> You can omit the type hint and let pandas-on-Spark infer its type.</span>
<span class="sd"> &gt;&gt;&gt; def plus_min(x):</span>
<span class="sd"> ... return x + x.min()</span>
<span class="sd"> &gt;&gt;&gt; g.transform(plus_min) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C</span>
<span class="sd"> 0 2 8</span>
<span class="sd"> 1 3 10</span>
<span class="sd"> 2 6 10</span>
<span class="sd"> In case of Series, it works as below.</span>
<span class="sd"> &gt;&gt;&gt; df.B.groupby(df.A).transform(plus_max)</span>
<span class="sd"> 0 3</span>
<span class="sd"> 1 4</span>
<span class="sd"> 2 6</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> &gt;&gt;&gt; (df * -1).B.groupby(df.A).transform(abs)</span>
<span class="sd"> 0 1</span>
<span class="sd"> 1 2</span>
<span class="sd"> 2 3</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> You can also specify extra arguments to pass to the function.</span>
<span class="sd"> &gt;&gt;&gt; def calculation(x, y, z) -&gt; ps.Series[np.int]:</span>
<span class="sd"> ... return x + x.min() + y + z</span>
<span class="sd"> &gt;&gt;&gt; g.transform(calculation, 5, z=20) # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> B C</span>
<span class="sd"> 0 27 33</span>
<span class="sd"> 1 28 35</span>
<span class="sd"> 2 31 35</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">callable</span><span class="p">(</span><span class="n">func</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;</span><span class="si">%s</span><span class="s2"> object is not callable&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span>
<span class="n">spec</span> <span class="o">=</span> <span class="n">inspect</span><span class="o">.</span><span class="n">getfullargspec</span><span class="p">(</span><span class="n">func</span><span class="p">)</span>
<span class="n">return_sig</span> <span class="o">=</span> <span class="n">spec</span><span class="o">.</span><span class="n">annotations</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;return&quot;</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span>
<span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">agg_columns</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">pandas_transform</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span>
<span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="n">should_infer_schema</span> <span class="o">=</span> <span class="n">return_sig</span> <span class="ow">is</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">should_infer_schema</span><span class="p">:</span>
<span class="c1"># Here we execute with the first 1000 to get the return type.</span>
<span class="c1"># If the records were less than 1000, it uses pandas API directly for a shortcut.</span>
<span class="n">log_advice</span><span class="p">(</span>
<span class="s2">&quot;If the type hints is not specified for `grouby.transform`, &quot;</span>
<span class="s2">&quot;it is expensive to infer the data type internally.&quot;</span>
<span class="p">)</span>
<span class="n">limit</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">&quot;compute.shortcut_limit&quot;</span><span class="p">)</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">limit</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span>
<span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="n">psdf_from_pandas</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span>
<span class="n">return_schema</span> <span class="o">=</span> <span class="n">force_decimal_precision_scale</span><span class="p">(</span>
<span class="n">as_nullable_spark_type</span><span class="p">(</span>
<span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span><span class="o">.</span><span class="n">schema</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">limit</span><span class="p">:</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cleanup_and_return</span><span class="p">(</span><span class="n">psdf_from_pandas</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">pandas_transform</span><span class="p">,</span>
<span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span>
<span class="n">return_schema</span><span class="p">,</span>
<span class="n">retain_index</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># If schema is inferred, we can restore indexes too.</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span>
<span class="n">sdf</span><span class="p">,</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span>
<span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span>
<span class="p">],</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">return_type</span> <span class="o">=</span> <span class="n">infer_return_type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">return_type</span><span class="p">,</span> <span class="n">SeriesType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;Expected the return type of this function to be of Series type, &quot;</span>
<span class="s2">&quot;but found type </span><span class="si">{}</span><span class="s2">&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">return_type</span><span class="p">)</span>
<span class="p">)</span>
<span class="n">dtype</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">dtype</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">spark_type</span>
<span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">InternalField</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">struct_field</span><span class="o">=</span><span class="n">StructField</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">c</span><span class="p">,</span> <span class="n">dataType</span><span class="o">=</span><span class="n">spark_type</span><span class="p">))</span>
<span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span>
<span class="k">if</span> <span class="n">c</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">groupkey_names</span>
<span class="p">]</span>
<span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">data_fields</span><span class="p">])</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">pandas_transform</span><span class="p">,</span>
<span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span>
<span class="n">return_schema</span><span class="p">,</span>
<span class="n">retain_index</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># Otherwise, it loses index.</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cleanup_and_return</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<div class="viewcode-block" id="GroupBy.nunique"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.nunique.html#pyspark.pandas.groupby.GroupBy.nunique">[docs]</a> <span class="k">def</span> <span class="nf">nunique</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return DataFrame with number of distinct observations per group for each column.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> dropna : boolean, default True</span>
<span class="sd"> Don’t include NaN in the counts.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> nunique : DataFrame or Series</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;id&#39;: [&#39;spam&#39;, &#39;egg&#39;, &#39;egg&#39;, &#39;spam&#39;,</span>
<span class="sd"> ... &#39;ham&#39;, &#39;ham&#39;],</span>
<span class="sd"> ... &#39;value1&#39;: [1, 5, 5, 2, 5, 5],</span>
<span class="sd"> ... &#39;value2&#39;: list(&#39;abbaxy&#39;)}, columns=[&#39;id&#39;, &#39;value1&#39;, &#39;value2&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> id value1 value2</span>
<span class="sd"> 0 spam 1 a</span>
<span class="sd"> 1 egg 5 b</span>
<span class="sd"> 2 egg 5 b</span>
<span class="sd"> 3 spam 2 a</span>
<span class="sd"> 4 ham 5 x</span>
<span class="sd"> 5 ham 5 y</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;id&#39;).nunique().sort_index() # doctest: +SKIP</span>
<span class="sd"> value1 value2</span>
<span class="sd"> id</span>
<span class="sd"> egg 1 1</span>
<span class="sd"> ham 1 2</span>
<span class="sd"> spam 2 1</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;id&#39;)[&#39;value1&#39;].nunique().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> id</span>
<span class="sd"> egg 1</span>
<span class="sd"> ham 1</span>
<span class="sd"> spam 2</span>
<span class="sd"> Name: value1, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">dropna</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">stat_function</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">countDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">stat_function</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">countDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="o">+</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span>
<span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">))</span> <span class="o">&gt;=</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span>
<span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">stat_function</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">rolling</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">window</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RollingGroupby[FrameLike]&quot;</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return an rolling grouper, providing rolling</span>
<span class="sd"> functionality per group.</span>
<span class="sd"> .. note:: &#39;min_periods&#39; in pandas-on-Spark works as a fixed window size unlike pandas.</span>
<span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span>
<span class="sd"> in the near future.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> window : int, or offset</span>
<span class="sd"> Size of the moving window.</span>
<span class="sd"> This is the number of observations used for calculating the statistic.</span>
<span class="sd"> Each window will be a fixed size.</span>
<span class="sd"> min_periods : int, default 1</span>
<span class="sd"> Minimum number of observations in window required to have a value</span>
<span class="sd"> (otherwise result is NA).</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.groupby</span>
<span class="sd"> DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">RollingGroupby</span>
<span class="k">return</span> <span class="n">RollingGroupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">window</span><span class="p">,</span> <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">expanding</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;ExpandingGroupby[FrameLike]&quot;</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return an expanding grouper, providing expanding</span>
<span class="sd"> functionality per group.</span>
<span class="sd"> .. note:: &#39;min_periods&#39; in pandas-on-Spark works as a fixed window size unlike pandas.</span>
<span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span>
<span class="sd"> in the near future.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> min_periods : int, default 1</span>
<span class="sd"> Minimum number of observations in window required to have a value</span>
<span class="sd"> (otherwise result is NA).</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> Series.groupby</span>
<span class="sd"> DataFrame.groupby</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">ExpandingGroupby</span>
<span class="k">return</span> <span class="n">ExpandingGroupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">)</span>
<div class="viewcode-block" id="GroupBy.get_group"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.get_group.html#pyspark.pandas.groupby.GroupBy.get_group">[docs]</a> <span class="k">def</span> <span class="nf">get_group</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Construct DataFrame from group with provided name.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : object</span>
<span class="sd"> The name of the group to get as a DataFrame.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> group : same type as obj</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame([(&#39;falcon&#39;, &#39;bird&#39;, 389.0),</span>
<span class="sd"> ... (&#39;parrot&#39;, &#39;bird&#39;, 24.0),</span>
<span class="sd"> ... (&#39;lion&#39;, &#39;mammal&#39;, 80.5),</span>
<span class="sd"> ... (&#39;monkey&#39;, &#39;mammal&#39;, np.nan)],</span>
<span class="sd"> ... columns=[&#39;name&#39;, &#39;class&#39;, &#39;max_speed&#39;],</span>
<span class="sd"> ... index=[0, 2, 3, 1])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> name class max_speed</span>
<span class="sd"> 0 falcon bird 389.0</span>
<span class="sd"> 2 parrot bird 24.0</span>
<span class="sd"> 3 lion mammal 80.5</span>
<span class="sd"> 1 monkey mammal NaN</span>
<span class="sd"> &gt;&gt;&gt; psdf.groupby(&quot;class&quot;).get_group(&quot;bird&quot;).sort_index()</span>
<span class="sd"> name class max_speed</span>
<span class="sd"> 0 falcon bird 389.0</span>
<span class="sd"> 2 parrot bird 24.0</span>
<span class="sd"> &gt;&gt;&gt; psdf.groupby(&quot;class&quot;).get_group(&quot;mammal&quot;).sort_index()</span>
<span class="sd"> name class max_speed</span>
<span class="sd"> 1 monkey mammal NaN</span>
<span class="sd"> 3 lion mammal 80.5</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">groupkeys</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_hashable</span><span class="p">(</span><span class="n">name</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;unhashable type: &#39;</span><span class="si">{}</span><span class="s2">&#39;&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">name</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span>
<span class="k">elif</span> <span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;must supply a tuple to get_group with multiple grouping keys&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">name</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;must supply a same-length tuple to get_group with multiple grouping keys&quot;</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">name</span><span class="p">):</span>
<span class="n">name</span> <span class="o">=</span> <span class="p">[</span><span class="n">name</span><span class="p">]</span>
<span class="n">cond</span> <span class="o">=</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)</span>
<span class="k">for</span> <span class="n">groupkey</span><span class="p">,</span> <span class="n">item</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span>
<span class="n">scol</span> <span class="o">=</span> <span class="n">groupkey</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span>
<span class="n">cond</span> <span class="o">=</span> <span class="n">cond</span> <span class="o">&amp;</span> <span class="p">(</span><span class="n">scol</span> <span class="o">==</span> <span class="n">item</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span>
<span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span>
<span class="n">spark_frame</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="n">internal</span><span class="o">.</span><span class="n">index_spark_columns</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_scols</span>
<span class="p">)</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">spark_frame</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">spark_frame</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">spark_frame</span><span class="p">,</span> <span class="n">s</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span>
<span class="k">if</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">head</span><span class="p">()</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">name</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cleanup_and_return</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<div class="viewcode-block" id="GroupBy.median"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.median.html#pyspark.pandas.groupby.GroupBy.median">[docs]</a> <span class="k">def</span> <span class="nf">median</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10000</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute median of groups, excluding missing values.</span>
<span class="sd"> For multiple groupings, the result index will be a MultiIndex</span>
<span class="sd"> .. note:: Unlike pandas&#39;, the median in pandas-on-Spark is an approximated median based upon</span>
<span class="sd"> approximate percentile computation because computing median across a large dataset</span>
<span class="sd"> is extremely expensive.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> numeric_only : bool, default True</span>
<span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span>
<span class="sd"> is mainly for pandas compatibility.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Series or DataFrame</span>
<span class="sd"> Median of values within each group.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; psdf = ps.DataFrame({&#39;a&#39;: [1., 1., 1., 1., 2., 2., 2., 3., 3., 3.],</span>
<span class="sd"> ... &#39;b&#39;: [2., 3., 1., 4., 6., 9., 8., 10., 7., 5.],</span>
<span class="sd"> ... &#39;c&#39;: [3., 5., 2., 5., 1., 2., 6., 4., 3., 6.]},</span>
<span class="sd"> ... columns=[&#39;a&#39;, &#39;b&#39;, &#39;c&#39;],</span>
<span class="sd"> ... index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])</span>
<span class="sd"> &gt;&gt;&gt; psdf</span>
<span class="sd"> a b c</span>
<span class="sd"> 7 1.0 2.0 3.0</span>
<span class="sd"> 2 1.0 3.0 5.0</span>
<span class="sd"> 4 1.0 1.0 2.0</span>
<span class="sd"> 1 1.0 4.0 5.0</span>
<span class="sd"> 3 2.0 6.0 1.0</span>
<span class="sd"> 4 2.0 9.0 2.0</span>
<span class="sd"> 9 2.0 8.0 6.0</span>
<span class="sd"> 10 3.0 10.0 4.0</span>
<span class="sd"> 5 3.0 7.0 3.0</span>
<span class="sd"> 6 3.0 5.0 6.0</span>
<span class="sd"> DataFrameGroupBy</span>
<span class="sd"> &gt;&gt;&gt; psdf.groupby(&#39;a&#39;).median().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> b c</span>
<span class="sd"> a</span>
<span class="sd"> 1.0 2.0 3.0</span>
<span class="sd"> 2.0 8.0 2.0</span>
<span class="sd"> 3.0 7.0 4.0</span>
<span class="sd"> SeriesGroupBy</span>
<span class="sd"> &gt;&gt;&gt; psdf.groupby(&#39;a&#39;)[&#39;b&#39;].median().sort_index()</span>
<span class="sd"> a</span>
<span class="sd"> 1.0 2.0</span>
<span class="sd"> 2.0 8.0</span>
<span class="sd"> 3.0 7.0</span>
<span class="sd"> Name: b, dtype: float64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">accuracy</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span>
<span class="s2">&quot;accuracy must be an integer; however, got [</span><span class="si">%s</span><span class="s2">]&quot;</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">stat_function</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">percentile_approx</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">stat_function</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_reduce_for_stat_function</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">sfun</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> <span class="n">only_numeric</span><span class="p">:</span> <span class="nb">bool</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">FrameLike</span><span class="p">:</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span>
<span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">psser</span>
<span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="n">only_numeric</span>
<span class="p">]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="o">*</span><span class="n">groupkey_scols</span><span class="p">,</span> <span class="o">*</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">]</span>
<span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">],</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span>
<span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span>
<span class="n">column_label_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">stat_exprs</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="n">stat_exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span>
<span class="n">sfun</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">nan_to_null</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">stat_exprs</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">:</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span>
<span class="n">subset</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">:</span>
<span class="n">should_drop_index</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span>
<span class="n">i</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">gkey</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> <span class="k">if</span> <span class="n">gkey</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">should_drop_index</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">level</span><span class="o">=</span><span class="n">should_drop_index</span><span class="p">,</span> <span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">should_drop_index</span><span class="p">)</span> <span class="o">&lt;</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">):</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cleanup_and_return</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_resolve_grouping_from_diff_dataframes</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> <span class="n">Set</span><span class="p">[</span><span class="n">Label</span><span class="p">]]:</span>
<span class="n">column_labels_level</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">additional_pssers</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">additional_column_labels</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">tmp_column_labels</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">by</span><span class="p">):</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">if</span> <span class="n">col_or_s</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="n">psdf</span><span class="p">:</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="o">.</span><span class="n">_column_label</span><span class="p">)</span>
<span class="k">elif</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">psdf</span><span class="p">):</span>
<span class="n">temp_label</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="s2">&quot;__tmp_groupkey_</span><span class="si">{}</span><span class="s2">__&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">))</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span>
<span class="n">additional_pssers</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">temp_label</span><span class="p">))</span>
<span class="n">additional_column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">temp_label</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="nb">tuple</span><span class="p">(</span>
<span class="p">([</span><span class="s2">&quot;&quot;</span><span class="p">]</span> <span class="o">*</span> <span class="p">(</span><span class="n">column_labels_level</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span> <span class="o">+</span> <span class="p">[</span><span class="s2">&quot;__tmp_groupkey_</span><span class="si">{}</span><span class="s2">__&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">)]</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span>
<span class="n">tmp_column_labels</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">col_or_s</span><span class="p">]</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">))</span>
<span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span>
<span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">]</span>
<span class="o">+</span> <span class="n">additional_pssers</span>
<span class="p">)</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">assign_columns</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">this_column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> <span class="n">that_column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]]:</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span>
<span class="s2">&quot;Duplicated labels with groupby() and &quot;</span>
<span class="s2">&quot;&#39;compute.ops_on_diff_frames&#39; option are not supported currently &quot;</span>
<span class="s2">&quot;Please use unique labels in series and frames.&quot;</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">col_or_s</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">):</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">tmp_column_labels</span><span class="p">:</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">col_or_s</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="n">align_diff_frames</span><span class="p">(</span>
<span class="n">assign_columns</span><span class="p">,</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">psser</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">label</span><span class="p">),</span>
<span class="n">fillna</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span>
<span class="n">how</span><span class="o">=</span><span class="s2">&quot;inner&quot;</span><span class="p">,</span>
<span class="n">preserve_order_column</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">tmp_column_labels</span> <span class="o">|=</span> <span class="nb">set</span><span class="p">(</span><span class="n">additional_column_labels</span><span class="p">)</span>
<span class="n">new_by_series</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">col_or_s</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">):</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">tmp_column_labels</span><span class="p">:</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">col_or_s</span>
<span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">name</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span>
<span class="k">return</span> <span class="n">psdf</span><span class="p">,</span> <span class="n">new_by_series</span><span class="p">,</span> <span class="n">tmp_column_labels</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_resolve_grouping</span><span class="p">(</span><span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">]:</span>
<span class="n">new_by_series</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="n">by</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">col_or_s</span><span class="p">]</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">))</span>
<span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span>
<span class="k">return</span> <span class="n">new_by_series</span>
<span class="k">class</span> <span class="nc">DataFrameGroupBy</span><span class="p">(</span><span class="n">GroupBy</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">]):</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_build</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrameGroupBy&quot;</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">col_or_s</span><span class="p">)</span> <span class="k">for</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="n">by</span><span class="p">):</span>
<span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">new_by_series</span><span class="p">,</span>
<span class="n">column_labels_to_exclude</span><span class="p">,</span>
<span class="p">)</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping_from_diff_dataframes</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">by</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">new_by_series</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">by</span><span class="p">)</span>
<span class="n">column_labels_to_exclude</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span>
<span class="k">return</span> <span class="n">DataFrameGroupBy</span><span class="p">(</span>
<span class="n">psdf</span><span class="p">,</span>
<span class="n">new_by_series</span><span class="p">,</span>
<span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span>
<span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span>
<span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="n">column_labels_to_exclude</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span>
<span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span>
<span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
<span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
<span class="n">column_labels_to_exclude</span><span class="p">:</span> <span class="n">Set</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span>
<span class="n">agg_columns</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">):</span>
<span class="n">agg_columns_selected</span> <span class="o">=</span> <span class="n">agg_columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">agg_columns_selected</span><span class="p">:</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">:</span>
<span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels_to_exclude</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">label</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">label</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">any</span><span class="p">(</span><span class="n">label</span> <span class="o">==</span> <span class="n">key</span><span class="o">.</span><span class="n">_column_label</span> <span class="ow">and</span> <span class="n">key</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="n">psdf</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">by</span><span class="p">)</span>
<span class="ow">and</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_labels_to_exclude</span>
<span class="p">]</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">=</span><span class="n">psdf</span><span class="p">,</span>
<span class="n">groupkeys</span><span class="o">=</span><span class="n">by</span><span class="p">,</span>
<span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span>
<span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span>
<span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="n">column_labels_to_exclude</span><span class="p">,</span>
<span class="n">agg_columns_selected</span><span class="o">=</span><span class="n">agg_columns_selected</span><span class="p">,</span>
<span class="n">agg_columns</span><span class="o">=</span><span class="p">[</span><span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span>
<span class="p">)</span>
<span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">MissingPandasLikeDataFrameGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">):</span>
<span class="n">property_or_func</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">MissingPandasLikeDataFrameGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="nb">property</span><span class="p">):</span>
<span class="k">return</span> <span class="n">property_or_func</span><span class="o">.</span><span class="n">fget</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">partial</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="fm">__getitem__</span><span class="p">(</span><span class="n">item</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">GroupBy</span><span class="p">:</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span> <span class="ow">and</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">item</span><span class="p">):</span>
<span class="k">return</span> <span class="n">SeriesGroupBy</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">item</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">item</span><span class="p">,)),</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span>
<span class="n">dropna</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">item</span><span class="p">):</span>
<span class="n">item</span> <span class="o">=</span> <span class="p">[</span><span class="n">item</span><span class="p">]</span>
<span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">item</span><span class="p">):</span>
<span class="n">item</span> <span class="o">=</span> <span class="p">[(</span><span class="n">item</span><span class="p">,)]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">item</span> <span class="o">=</span> <span class="p">[</span><span class="n">i</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">i</span><span class="p">,)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">item</span><span class="p">]</span>
<span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">:</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">key</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span>
<span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">item</span><span class="p">:</span>
<span class="k">if</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span>
<span class="s2">&quot;cannot insert </span><span class="si">{}</span><span class="s2">, already exists&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">name</span><span class="p">))</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrameGroupBy</span><span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span>
<span class="n">as_index</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">,</span>
<span class="n">dropna</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">,</span>
<span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span><span class="p">,</span>
<span class="n">agg_columns</span><span class="o">=</span><span class="n">item</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">&quot;SeriesGroupBy&quot;</span><span class="p">],</span> <span class="n">Series</span><span class="p">],</span>
<span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="n">applied</span> <span class="o">=</span> <span class="p">[]</span>
<span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">:</span>
<span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">(</span><span class="n">column</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span>
<span class="k">if</span> <span class="n">numeric_only</span><span class="p">:</span>
<span class="n">applied</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">applied</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)]</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">applied</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">DataError</span><span class="p">(</span><span class="s2">&quot;No numeric types to aggregate&quot;</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span><span class="n">applied</span><span class="p">,</span> <span class="n">keep_order</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span>
<span class="k">if</span> <span class="n">should_resolve</span><span class="p">:</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">resolved_copy</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_cleanup_and_return</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="k">return</span> <span class="n">psdf</span>
<span class="c1"># TODO: Implement &#39;percentiles&#39;, &#39;include&#39;, and &#39;exclude&#39; arguments.</span>
<span class="c1"># TODO: Add ``DataFrame.select_dtypes`` to See Also when &#39;include&#39;</span>
<span class="c1"># and &#39;exclude&#39; arguments are implemented.</span>
<div class="viewcode-block" id="DataFrameGroupBy.describe"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.DataFrameGroupBy.describe.html#pyspark.pandas.groupby.DataFrameGroupBy.describe">[docs]</a> <span class="k">def</span> <span class="nf">describe</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Generate descriptive statistics that summarize the central tendency,</span>
<span class="sd"> dispersion and shape of a dataset&#39;s distribution, excluding</span>
<span class="sd"> ``NaN`` values.</span>
<span class="sd"> Analyzes both numeric and object series, as well</span>
<span class="sd"> as ``DataFrame`` column sets of mixed data types. The output</span>
<span class="sd"> will vary depending on what is provided. Refer to the notes</span>
<span class="sd"> below for more detail.</span>
<span class="sd"> .. note:: Unlike pandas, the percentiles in pandas-on-Spark are based upon</span>
<span class="sd"> approximate percentile computation because computing percentiles</span>
<span class="sd"> across a large dataset is extremely expensive.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> DataFrame</span>
<span class="sd"> Summary statistics of the DataFrame provided.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.count</span>
<span class="sd"> DataFrame.max</span>
<span class="sd"> DataFrame.min</span>
<span class="sd"> DataFrame.mean</span>
<span class="sd"> DataFrame.std</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 3], &#39;b&#39;: [4, 5, 6], &#39;c&#39;: [7, 8, 9]})</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> a b c</span>
<span class="sd"> 0 1 4 7</span>
<span class="sd"> 1 1 5 8</span>
<span class="sd"> 2 3 6 9</span>
<span class="sd"> Describing a ``DataFrame``. By default only numeric fields</span>
<span class="sd"> are returned.</span>
<span class="sd"> &gt;&gt;&gt; described = df.groupby(&#39;a&#39;).describe()</span>
<span class="sd"> &gt;&gt;&gt; described.sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> b c</span>
<span class="sd"> count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max</span>
<span class="sd"> a</span>
<span class="sd"> 1 2.0 4.5 0.707107 4.0 4.0 4.0 5.0 5.0 2.0 7.5 0.707107 7.0 7.0 7.0 8.0 8.0</span>
<span class="sd"> 3 1.0 6.0 NaN 6.0 6.0 6.0 6.0 6.0 1.0 9.0 NaN 9.0 9.0 9.0 9.0 9.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">StringType</span><span class="p">):</span>
<span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span>
<span class="s2">&quot;DataFrameGroupBy.describe() doesn&#39;t support for string type for now&quot;</span>
<span class="p">)</span>
<span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">aggregate</span><span class="p">([</span><span class="s2">&quot;count&quot;</span><span class="p">,</span> <span class="s2">&quot;mean&quot;</span><span class="p">,</span> <span class="s2">&quot;std&quot;</span><span class="p">,</span> <span class="s2">&quot;min&quot;</span><span class="p">,</span> <span class="s2">&quot;quartiles&quot;</span><span class="p">,</span> <span class="s2">&quot;max&quot;</span><span class="p">])</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="n">agg_column_labels</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">]</span>
<span class="n">formatted_percentiles</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;25%&quot;</span><span class="p">,</span> <span class="s2">&quot;50%&quot;</span><span class="p">,</span> <span class="s2">&quot;75%&quot;</span><span class="p">]</span>
<span class="c1"># Split &quot;quartiles&quot; columns into first, second, and third quartiles.</span>
<span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">agg_column_labels</span><span class="p">:</span>
<span class="n">quartiles_col</span> <span class="o">=</span> <span class="n">name_like_string</span><span class="p">(</span><span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="s2">&quot;quartiles&quot;</span><span class="p">]))</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">percentile</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">formatted_percentiles</span><span class="p">):</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span>
<span class="n">name_like_string</span><span class="p">(</span><span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="n">percentile</span><span class="p">])),</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">quartiles_col</span><span class="p">)[</span><span class="n">i</span><span class="p">],</span>
<span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">quartiles_col</span><span class="p">)</span>
<span class="c1"># Reorder columns lexicographically by agg column followed by stats.</span>
<span class="n">stats</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;count&quot;</span><span class="p">,</span> <span class="s2">&quot;mean&quot;</span><span class="p">,</span> <span class="s2">&quot;std&quot;</span><span class="p">,</span> <span class="s2">&quot;min&quot;</span><span class="p">]</span> <span class="o">+</span> <span class="n">formatted_percentiles</span> <span class="o">+</span> <span class="p">[</span><span class="s2">&quot;max&quot;</span><span class="p">]</span>
<span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span><span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="n">s</span><span class="p">])</span> <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">product</span><span class="p">(</span><span class="n">agg_column_labels</span><span class="p">,</span> <span class="n">stats</span><span class="p">)]</span>
<span class="n">data_columns</span> <span class="o">=</span> <span class="nb">map</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">)</span>
<span class="c1"># Reindex the DataFrame to reflect initial grouping and agg columns.</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span>
<span class="p">)</span>
<span class="c1"># Cast columns to ``&quot;float64&quot;`` to match `pandas.DataFrame.groupby`.</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s2">&quot;float64&quot;</span><span class="p">)</span></div>
<span class="k">class</span> <span class="nc">SeriesGroupBy</span><span class="p">(</span><span class="n">GroupBy</span><span class="p">[</span><span class="n">Series</span><span class="p">]):</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_build</span><span class="p">(</span>
<span class="n">psser</span><span class="p">:</span> <span class="n">Series</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SeriesGroupBy&quot;</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">any</span><span class="p">(</span>
<span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">col_or_s</span><span class="p">)</span> <span class="k">for</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="n">by</span>
<span class="p">):</span>
<span class="n">psdf</span><span class="p">,</span> <span class="n">new_by_series</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping_from_diff_dataframes</span><span class="p">(</span>
<span class="n">psser</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(),</span> <span class="n">by</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">SeriesGroupBy</span><span class="p">(</span>
<span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">name</span><span class="p">),</span>
<span class="n">new_by_series</span><span class="p">,</span>
<span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span>
<span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">new_by_series</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> <span class="n">by</span><span class="p">)</span>
<span class="k">return</span> <span class="n">SeriesGroupBy</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">new_by_series</span><span class="p">,</span> <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">)</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">psser</span><span class="p">:</span> <span class="n">Series</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">):</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">as_index</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">&quot;as_index=False only valid with DataFrame&quot;</span><span class="p">)</span>
<span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span>
<span class="n">psdf</span><span class="o">=</span><span class="n">psser</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span>
<span class="n">groupkeys</span><span class="o">=</span><span class="n">by</span><span class="p">,</span>
<span class="n">as_index</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span>
<span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="nb">set</span><span class="p">(),</span>
<span class="n">agg_columns_selected</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span>
<span class="n">agg_columns</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="p">],</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_psser</span> <span class="o">=</span> <span class="n">psser</span>
<span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">MissingPandasLikeSeriesGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">):</span>
<span class="n">property_or_func</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">MissingPandasLikeSeriesGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="nb">property</span><span class="p">):</span>
<span class="k">return</span> <span class="n">property_or_func</span><span class="o">.</span><span class="n">fget</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">partial</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span>
<span class="k">raise</span> <span class="ne">AttributeError</span><span class="p">(</span><span class="n">item</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">&quot;SeriesGroupBy&quot;</span><span class="p">],</span> <span class="n">Series</span><span class="p">],</span>
<span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span>
<span class="k">raise</span> <span class="n">DataError</span><span class="p">(</span><span class="s2">&quot;No numeric types to aggregate&quot;</span><span class="p">)</span>
<span class="n">psser</span> <span class="o">=</span> <span class="n">op</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="k">if</span> <span class="n">should_resolve</span><span class="p">:</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">psser</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">_cleanup_and_return</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">()</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">agg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">MissingPandasLikeSeriesGroupBy</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">aggregate</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="n">MissingPandasLikeSeriesGroupBy</span><span class="o">.</span><span class="n">aggregate</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">size</span><span class="p">()</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span>
<span class="n">size</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">size</span><span class="o">.</span><span class="vm">__doc__</span>
<span class="c1"># TODO: add keep parameter</span>
<div class="viewcode-block" id="SeriesGroupBy.nsmallest"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.nsmallest.html#pyspark.pandas.groupby.SeriesGroupBy.nsmallest">[docs]</a> <span class="k">def</span> <span class="nf">nsmallest</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the smallest `n` elements.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> n : int</span>
<span class="sd"> Number of items to retrieve.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.nsmallest</span>
<span class="sd"> pyspark.pandas.DataFrame.nsmallest</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 1, 2, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;b&#39;: [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;a&#39;])[&#39;b&#39;].nsmallest(1).sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> a</span>
<span class="sd"> 1 0 1</span>
<span class="sd"> 2 3 2</span>
<span class="sd"> 3 6 3</span>
<span class="sd"> Name: b, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;nsmallest do not support multi-index now&quot;</span><span class="p">)</span>
<span class="n">groupkey_col_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="o">*</span><span class="p">[</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)],</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span>
<span class="p">],</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span>
<span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_col_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">.</span><span class="n">asc</span><span class="p">(),</span>
<span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">temp_rank_column</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;__rank__&quot;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span>
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">n</span><span class="p">)</span>
<span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span>
<span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">(</span>
<span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_col_names</span><span class="p">]</span>
<span class="o">+</span> <span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">)</span>
<span class="p">]</span>
<span class="p">),</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">(</span>
<span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span>
<span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span>
<span class="p">),</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">(</span>
<span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)</span>
<span class="p">]</span>
<span class="o">+</span> <span class="p">[</span>
<span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">field</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">)</span>
<span class="p">]</span>
<span class="p">),</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_column_label</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<span class="c1"># TODO: add keep parameter</span>
<div class="viewcode-block" id="SeriesGroupBy.nlargest"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.nlargest.html#pyspark.pandas.groupby.SeriesGroupBy.nlargest">[docs]</a> <span class="k">def</span> <span class="nf">nlargest</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return the first n rows ordered by columns in descending order in group.</span>
<span class="sd"> Return the first n rows with the smallest values in columns, in descending order.</span>
<span class="sd"> The columns that are not specified are returned as well, but not used for ordering.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> n : int</span>
<span class="sd"> Number of items to retrieve.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.nlargest</span>
<span class="sd"> pyspark.pandas.DataFrame.nlargest</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 1, 2, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;b&#39;: [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;a&#39;])[&#39;b&#39;].nlargest(1).sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> a</span>
<span class="sd"> 1 1 2</span>
<span class="sd"> 2 4 3</span>
<span class="sd"> 3 7 4</span>
<span class="sd"> Name: b, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">&gt;</span> <span class="mi">1</span><span class="p">:</span>
<span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">&quot;nlargest do not support multi-index now&quot;</span><span class="p">)</span>
<span class="n">groupkey_col_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span>
<span class="o">*</span><span class="p">[</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)],</span>
<span class="o">*</span><span class="p">[</span>
<span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span>
<span class="p">],</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span>
<span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_col_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">.</span><span class="n">desc</span><span class="p">(),</span>
<span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">temp_rank_column</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">&quot;__rank__&quot;</span><span class="p">)</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span>
<span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">n</span><span class="p">)</span>
<span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span>
<span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">(</span>
<span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_col_names</span><span class="p">]</span>
<span class="o">+</span> <span class="p">[</span>
<span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span>
<span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">)</span>
<span class="p">]</span>
<span class="p">),</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">(</span>
<span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span>
<span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span>
<span class="p">),</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">(</span>
<span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)</span>
<span class="p">]</span>
<span class="o">+</span> <span class="p">[</span>
<span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">field</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">)</span>
<span class="p">]</span>
<span class="p">),</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_column_label</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])],</span>
<span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<span class="c1"># TODO: add bins, normalize parameter</span>
<div class="viewcode-block" id="SeriesGroupBy.value_counts"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.value_counts.html#pyspark.pandas.groupby.SeriesGroupBy.value_counts">[docs]</a> <span class="k">def</span> <span class="nf">value_counts</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">sort</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Compute group sizes.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> sort : boolean, default None</span>
<span class="sd"> Sort by frequencies.</span>
<span class="sd"> ascending : boolean, default False</span>
<span class="sd"> Sort in ascending order.</span>
<span class="sd"> dropna : boolean, default True</span>
<span class="sd"> Don&#39;t include counts of NaN.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.groupby</span>
<span class="sd"> pyspark.pandas.DataFrame.groupby</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;A&#39;: [1, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;B&#39;: [1, 1, 2, 3, 3, np.nan]},</span>
<span class="sd"> ... columns=[&#39;A&#39;, &#39;B&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df</span>
<span class="sd"> A B</span>
<span class="sd"> 0 1 1.0</span>
<span class="sd"> 1 2 1.0</span>
<span class="sd"> 2 2 2.0</span>
<span class="sd"> 3 3 3.0</span>
<span class="sd"> 4 3 3.0</span>
<span class="sd"> 5 3 NaN</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;)[&#39;B&#39;].value_counts().sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> A B</span>
<span class="sd"> 1 1.0 1</span>
<span class="sd"> 2 1.0 1</span>
<span class="sd"> 2.0 1</span>
<span class="sd"> 3 3.0 2</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> Don&#39;t include counts of NaN when dropna is False.</span>
<span class="sd"> &gt;&gt;&gt; df.groupby(&#39;A&#39;)[&#39;B&#39;].value_counts(</span>
<span class="sd"> ... dropna=False).sort_index() # doctest: +NORMALIZE_WHITESPACE</span>
<span class="sd"> A B</span>
<span class="sd"> 1 1.0 1</span>
<span class="sd"> 2 1.0 1</span>
<span class="sd"> 2.0 1</span>
<span class="sd"> 3 3.0 2</span>
<span class="sd"> NaN 1</span>
<span class="sd"> Name: B, dtype: int64</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">groupkeys</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span>
<span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))]</span>
<span class="n">groupkey_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span>
<span class="n">agg_column</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_cols</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s2">&quot;count&quot;</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">:</span>
<span class="n">_groupkey_column_names</span> <span class="o">=</span> <span class="n">groupkey_names</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span><span class="n">subset</span><span class="o">=</span><span class="n">_groupkey_column_names</span><span class="p">)</span>
<span class="k">if</span> <span class="n">dropna</span><span class="p">:</span>
<span class="n">_agg_columns_names</span> <span class="o">=</span> <span class="n">groupkey_names</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> <span class="p">:]</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span><span class="n">subset</span><span class="o">=</span><span class="n">_agg_columns_names</span><span class="p">)</span>
<span class="k">if</span> <span class="n">sort</span><span class="p">:</span>
<span class="k">if</span> <span class="n">ascending</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)</span><span class="o">.</span><span class="n">asc</span><span class="p">())</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)</span><span class="o">.</span><span class="n">desc</span><span class="p">())</span>
<span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span>
<span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span>
<span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span>
<span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">groupkeys</span><span class="p">],</span>
<span class="n">index_fields</span><span class="o">=</span><span class="p">[</span>
<span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span>
<span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span>
<span class="p">],</span>
<span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_column_label</span><span class="p">],</span>
<span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)],</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div>
<div class="viewcode-block" id="SeriesGroupBy.unique"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.unique.html#pyspark.pandas.groupby.SeriesGroupBy.unique">[docs]</a> <span class="k">def</span> <span class="nf">unique</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Series</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Return unique values in group.</span>
<span class="sd"> Uniques are returned in order of unknown. It does NOT sort.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.Series.unique</span>
<span class="sd"> pyspark.pandas.Index.unique</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = ps.DataFrame({&#39;a&#39;: [1, 1, 1, 2, 2, 2, 3, 3, 3],</span>
<span class="sd"> ... &#39;b&#39;: [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=[&#39;a&#39;, &#39;b&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.groupby([&#39;a&#39;])[&#39;b&#39;].unique().sort_index() # doctest: +SKIP</span>
<span class="sd"> a</span>
<span class="sd"> 1 [1, 2]</span>
<span class="sd"> 2 [2, 3]</span>
<span class="sd"> 3 [3, 4]</span>
<span class="sd"> Name: b, dtype: object</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">collect_set</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">is_multi_agg_with_relabel</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Check whether the kwargs pass to .agg look like multi-agg with relabling.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> **kwargs : dict</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> bool</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; is_multi_agg_with_relabel(a=&#39;max&#39;)</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; is_multi_agg_with_relabel(a_max=(&#39;a&#39;, &#39;max&#39;),</span>
<span class="sd"> ... a_min=(&#39;a&#39;, &#39;min&#39;))</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; is_multi_agg_with_relabel()</span>
<span class="sd"> False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">kwargs</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">False</span>
<span class="k">return</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">values</span><span class="p">())</span>
<span class="k">def</span> <span class="nf">normalize_keyword_aggregation</span><span class="p">(</span>
<span class="n">kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="nb">str</span><span class="p">]],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">]]:</span>
<span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Normalize user-provided kwargs.</span>
<span class="sd"> Transforms from the new ``Dict[str, NamedAgg]`` style kwargs</span>
<span class="sd"> to the old defaultdict[str, List[scalar]].</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> kwargs : dict</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> aggspec : dict</span>
<span class="sd"> The transformed kwargs.</span>
<span class="sd"> columns : List[str]</span>
<span class="sd"> The user-provided keys.</span>
<span class="sd"> order : List[Tuple[str, str]]</span>
<span class="sd"> Pairs of the input and output column names.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; normalize_keyword_aggregation({&#39;output&#39;: (&#39;input&#39;, &#39;sum&#39;)})</span>
<span class="sd"> (defaultdict(&lt;class &#39;list&#39;&gt;, {&#39;input&#39;: [&#39;sum&#39;]}), [&#39;output&#39;], [(&#39;input&#39;, &#39;sum&#39;)])</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">aggspec</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">list</span><span class="p">)</span>
<span class="n">order</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span>
<span class="n">columns</span><span class="p">,</span> <span class="n">pairs</span> <span class="o">=</span> <span class="nb">zip</span><span class="p">(</span><span class="o">*</span><span class="n">kwargs</span><span class="o">.</span><span class="n">items</span><span class="p">())</span>
<span class="k">for</span> <span class="n">column</span><span class="p">,</span> <span class="n">aggfunc</span> <span class="ow">in</span> <span class="n">pairs</span><span class="p">:</span>
<span class="k">if</span> <span class="n">column</span> <span class="ow">in</span> <span class="n">aggspec</span><span class="p">:</span>
<span class="n">aggspec</span><span class="p">[</span><span class="n">column</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">aggfunc</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">aggspec</span><span class="p">[</span><span class="n">column</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="n">aggfunc</span><span class="p">]</span>
<span class="n">order</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">column</span><span class="p">,</span> <span class="n">aggfunc</span><span class="p">))</span>
<span class="c1"># For MultiIndex, we need to flatten the tuple, e.g. ((&#39;y&#39;, &#39;A&#39;), &#39;max&#39;) needs to be</span>
<span class="c1"># flattened to (&#39;y&#39;, &#39;A&#39;, &#39;max&#39;), it won&#39;t do anything on normal Index.</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">order</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">],</span> <span class="nb">tuple</span><span class="p">):</span>
<span class="n">order</span> <span class="o">=</span> <span class="p">[(</span><span class="o">*</span><span class="n">levs</span><span class="p">,</span> <span class="n">method</span><span class="p">)</span> <span class="k">for</span> <span class="n">levs</span><span class="p">,</span> <span class="n">method</span> <span class="ow">in</span> <span class="n">order</span><span class="p">]</span>
<span class="k">return</span> <span class="n">aggspec</span><span class="p">,</span> <span class="nb">list</span><span class="p">(</span><span class="n">columns</span><span class="p">),</span> <span class="n">order</span>
<span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">doctest</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">numpy</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span>
<span class="kn">import</span> <span class="nn">pyspark.pandas.groupby</span>
<span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_HOME&quot;</span><span class="p">])</span>
<span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">groupby</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;np&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">numpy</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;ps&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span>
<span class="n">spark</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[4]&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;pyspark.pandas.groupby tests&quot;</span><span class="p">)</span>
<span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">groupby</span><span class="p">,</span>
<span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span>
<span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="n">_test</span><span class="p">()</span>
</pre></div>
</div>
<div class='prev-next-bottom'>
</div>
</main>
</div>
</div>
<script src="../../../_static/js/index.3da636dd464baa7582d2.js"></script>
<footer class="footer mt-5 mt-md-0">
<div class="container">
<p>
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.0.4.<br/>
</p>
</div>
</footer>
</body>
</html>