| |
| <!DOCTYPE html> |
| |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8" /> |
| <title>pyspark.pandas.groupby — PySpark 3.3.1 documentation</title> |
| |
| <!-- Theme stylesheet, then the vendored Font Awesome, Open Sans and Lato stylesheets. --> |
| <link rel="stylesheet" href="../../../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css"> |
| |
| |
| <link rel="stylesheet" |
| href="../../../_static/vendor/fontawesome/5.13.0/css/all.min.css"> |
| <!-- Font preloads carry crossorigin so they match the mode of the later font fetch. --> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../../../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2"> |
| |
| |
| |
| <link rel="stylesheet" |
| href="../../../_static/vendor/open-sans_all/1.44.1/index.css"> |
| <link rel="stylesheet" |
| href="../../../_static/vendor/lato_latin-ext/1.44.1/index.css"> |
| |
| |
| <link rel="stylesheet" href="../../../_static/basic.css" type="text/css" /> |
| <link rel="stylesheet" href="../../../_static/pygments.css" type="text/css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" /> |
| |
| <link rel="preload" as="script" href="../../../_static/js/index.3da636dd464baa7582d2.js"> |
| |
| <!-- Sphinx runtime scripts; documentation_options.js is given the relative site root via data-url_root. --> |
| <script id="documentation_options" data-url_root="../../../" src="../../../_static/documentation_options.js"></script> |
| <script src="../../../_static/jquery.js"></script> |
| <script src="../../../_static/underscore.js"></script> |
| <script src="../../../_static/doctools.js"></script> |
| <script src="../../../_static/language_data.js"></script> |
| <script src="../../../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <!-- The MathJax 2 config block must already be in the document when the loader starts, so it is placed before the async loader script. --> |
| <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script> |
| <script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script> |
| <link rel="search" title="Search" href="../../../search.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="en" /> |
| </head> |
| <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80"> |
| |
| <!-- Fixed top bar: home link with logo, collapse toggler, and the five top-level documentation sections. --> |
| <nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main" aria-label="Site navigation"> |
| <div class="container-xl"> |
| |
| <a class="navbar-brand" href="../../../index.html"> |
| |
| <!-- The image is the only content of the home link, so its alt text names the link destination. --> |
| <img src="../../../_static/spark-logo-reverse.png" class="logo" alt="PySpark documentation home" /> |
| |
| </a> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| |
| <div id="navbar-menu" class="col-lg-9 collapse navbar-collapse"> |
| <ul id="navbar-main-elements" class="navbar-nav mr-auto"> |
| |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../getting_started/index.html">Getting Started</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../user_guide/index.html">User Guide</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../reference/index.html">API Reference</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../development/index.html">Development</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../../../migration_guide/index.html">Migration Guide</a> |
| </li> |
| |
| |
| </ul> |
| |
| |
| |
| |
| <ul class="navbar-nav"> |
| |
| |
| </ul> |
| </div> |
| </div> |
| </nav> |
| |
| |
| <div class="container-xl"> |
| <div class="row"> |
| |
| <!-- Left sidebar: documentation search form followed by the section navigation (empty on this page). --> |
| <div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../../../search.html" method="get"> |
| <!-- Decorative icon; the input below carries the accessible name. --> |
| <i class="icon fas fa-search" aria-hidden="true"></i> |
| <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" > |
| </form> |
| <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation"> |
| |
| <div class="bd-toc-item active"> |
| |
| |
| <ul class="nav bd-sidenav"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </ul> |
| <!-- Close bd-toc-item explicitly instead of relying on the parser to close it at the nav end tag. --> |
| </div> |
| </nav> |
| </div> |
| |
| |
| |
| <!-- In-page table of contents column; its nav is the element referenced by data-target="#bd-toc-nav" on body. The list is empty on this page. --> |
| <div class="d-none d-xl-block col-xl-2 bd-toc"> |
| |
| |
| <!-- Labelled so it can be told apart from the other navigation landmarks on the page. --> |
| <nav id="bd-toc-nav" aria-label="On this page"> |
| <ul class="nav section-nav flex-column"> |
| |
| </ul> |
| </nav> |
| |
| |
| |
| </div> |
| |
| |
| |
| <main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main"> |
| |
| <div> |
| |
| <h1>Source code for pyspark.pandas.groupby</h1><div class="highlight"><pre> |
| <span></span><span class="c1">#</span> |
| <span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span> |
| <span class="c1"># contributor license agreements. See the NOTICE file distributed with</span> |
| <span class="c1"># this work for additional information regarding copyright ownership.</span> |
| <span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span> |
| <span class="c1"># (the "License"); you may not use this file except in compliance with</span> |
| <span class="c1"># the License. You may obtain a copy of the License at</span> |
| <span class="c1">#</span> |
| <span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># Unless required by applicable law or agreed to in writing, software</span> |
| <span class="c1"># distributed under the License is distributed on an "AS IS" BASIS,</span> |
| <span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span> |
| <span class="c1"># See the License for the specific language governing permissions and</span> |
| <span class="c1"># limitations under the License.</span> |
| <span class="c1">#</span> |
| |
| <span class="sd">"""</span> |
| <span class="sd">A wrapper for GroupedData to behave similar to pandas GroupBy.</span> |
| <span class="sd">"""</span> |
| |
| <span class="kn">from</span> <span class="nn">abc</span> <span class="kn">import</span> <span class="n">ABCMeta</span><span class="p">,</span> <span class="n">abstractmethod</span> |
| <span class="kn">import</span> <span class="nn">inspect</span> |
| <span class="kn">from</span> <span class="nn">collections</span> <span class="kn">import</span> <span class="n">defaultdict</span><span class="p">,</span> <span class="n">namedtuple</span> |
| <span class="kn">from</span> <span class="nn">distutils.version</span> <span class="kn">import</span> <span class="n">LooseVersion</span> |
| <span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">partial</span> |
| <span class="kn">from</span> <span class="nn">itertools</span> <span class="kn">import</span> <span class="n">product</span> |
| <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">Any</span><span class="p">,</span> |
| <span class="n">Callable</span><span class="p">,</span> |
| <span class="n">Dict</span><span class="p">,</span> |
| <span class="n">Generic</span><span class="p">,</span> |
| <span class="n">Iterator</span><span class="p">,</span> |
| <span class="n">Mapping</span><span class="p">,</span> |
| <span class="n">List</span><span class="p">,</span> |
| <span class="n">Optional</span><span class="p">,</span> |
| <span class="n">Sequence</span><span class="p">,</span> |
| <span class="n">Set</span><span class="p">,</span> |
| <span class="n">Tuple</span><span class="p">,</span> |
| <span class="n">Union</span><span class="p">,</span> |
| <span class="n">cast</span><span class="p">,</span> |
| <span class="n">TYPE_CHECKING</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">import</span> <span class="nn">warnings</span> |
| |
| <span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span> |
| <span class="kn">from</span> <span class="nn">pandas.api.types</span> <span class="kn">import</span> <span class="n">is_hashable</span><span class="p">,</span> <span class="n">is_list_like</span> <span class="c1"># type: ignore[attr-defined]</span> |
| |
| <span class="k">if</span> <span class="n">LooseVersion</span><span class="p">(</span><span class="n">pd</span><span class="o">.</span><span class="n">__version__</span><span class="p">)</span> <span class="o">>=</span> <span class="n">LooseVersion</span><span class="p">(</span><span class="s2">"1.3.0"</span><span class="p">):</span> |
| <span class="kn">from</span> <span class="nn">pandas.core.common</span> <span class="kn">import</span> <span class="n">_builtin_table</span> <span class="c1"># type: ignore[attr-defined]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pandas.core.base</span> <span class="kn">import</span> <span class="n">SelectionMixin</span> |
| |
| <span class="n">_builtin_table</span> <span class="o">=</span> <span class="n">SelectionMixin</span><span class="o">.</span><span class="n">_builtin_table</span> <span class="c1"># type: ignore[attr-defined]</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Column</span><span class="p">,</span> <span class="n">DataFrame</span> <span class="k">as</span> <span class="n">SparkDataFrame</span><span class="p">,</span> <span class="n">Window</span><span class="p">,</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">F</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">NumericType</span><span class="p">,</span> |
| <span class="n">StructField</span><span class="p">,</span> |
| <span class="n">StructType</span><span class="p">,</span> |
| <span class="n">StringType</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">pandas</span> <span class="k">as</span> <span class="n">ps</span> <span class="c1"># For running doctests and reference resolution in PyCharm.</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas._typing</span> <span class="kn">import</span> <span class="n">Axis</span><span class="p">,</span> <span class="n">FrameLike</span><span class="p">,</span> <span class="n">Label</span><span class="p">,</span> <span class="n">Name</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.typedef</span> <span class="kn">import</span> <span class="n">infer_return_type</span><span class="p">,</span> <span class="n">DataFrameType</span><span class="p">,</span> <span class="n">ScalarType</span><span class="p">,</span> <span class="n">SeriesType</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.frame</span> <span class="kn">import</span> <span class="n">DataFrame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.internal</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">InternalField</span><span class="p">,</span> |
| <span class="n">InternalFrame</span><span class="p">,</span> |
| <span class="n">HIDDEN_COLUMNS</span><span class="p">,</span> |
| <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> |
| <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">,</span> |
| <span class="n">SPARK_DEFAULT_SERIES_NAME</span><span class="p">,</span> |
| <span class="n">SPARK_INDEX_NAME_PATTERN</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.missing.groupby</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">MissingPandasLikeDataFrameGroupBy</span><span class="p">,</span> |
| <span class="n">MissingPandasLikeSeriesGroupBy</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.series</span> <span class="kn">import</span> <span class="n">Series</span><span class="p">,</span> <span class="n">first_series</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.spark</span> <span class="kn">import</span> <span class="n">functions</span> <span class="k">as</span> <span class="n">SF</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.config</span> <span class="kn">import</span> <span class="n">get_option</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.utils</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">align_diff_frames</span><span class="p">,</span> |
| <span class="n">is_name_like_tuple</span><span class="p">,</span> |
| <span class="n">is_name_like_value</span><span class="p">,</span> |
| <span class="n">name_like_string</span><span class="p">,</span> |
| <span class="n">same_anchor</span><span class="p">,</span> |
| <span class="n">scol_for</span><span class="p">,</span> |
| <span class="n">verify_temp_column_name</span><span class="p">,</span> |
| <span class="n">log_advice</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.spark.utils</span> <span class="kn">import</span> <span class="n">as_nullable_spark_type</span><span class="p">,</span> <span class="n">force_decimal_precision_scale</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.exceptions</span> <span class="kn">import</span> <span class="n">DataError</span> |
| |
| <span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">RollingGroupby</span><span class="p">,</span> <span class="n">ExpandingGroupby</span> |
| |
| |
| <span class="c1"># to keep it the same as pandas</span> |
| <span class="n">NamedAgg</span> <span class="o">=</span> <span class="n">namedtuple</span><span class="p">(</span><span class="s2">"NamedAgg"</span><span class="p">,</span> <span class="p">[</span><span class="s2">"column"</span><span class="p">,</span> <span class="s2">"aggfunc"</span><span class="p">])</span> |
| |
| |
| <span class="k">class</span> <span class="nc">GroupBy</span><span class="p">(</span><span class="n">Generic</span><span class="p">[</span><span class="n">FrameLike</span><span class="p">],</span> <span class="n">metaclass</span><span class="o">=</span><span class="n">ABCMeta</span><span class="p">):</span> |
| <span class="sd">"""</span> |
| <span class="sd"> :ivar _psdf: The parent dataframe that is used to perform the groupby</span> |
| <span class="sd"> :type _psdf: DataFrame</span> |
| <span class="sd"> :ivar _groupkeys: The list of keys that will be used to perform the grouping</span> |
| <span class="sd"> :type _groupkeys: List[Series]</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">groupkeys</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> |
| <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="n">column_labels_to_exclude</span><span class="p">:</span> <span class="n">Set</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> |
| <span class="n">agg_columns_selected</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="n">agg_columns</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> |
| <span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> <span class="o">=</span> <span class="n">psdf</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span> <span class="o">=</span> <span class="n">groupkeys</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span> <span class="o">=</span> <span class="n">as_index</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span> <span class="o">=</span> <span class="n">dropna</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span> <span class="o">=</span> <span class="n">column_labels_to_exclude</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span> <span class="o">=</span> <span class="n">agg_columns_selected</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> <span class="o">=</span> <span class="n">agg_columns</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">_groupkeys_scols</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]:</span> |
| <span class="k">return</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">_agg_columns_scols</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]:</span> |
| <span class="k">return</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">]</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">"SeriesGroupBy"</span><span class="p">],</span> <span class="n">Series</span><span class="p">],</span> |
| <span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="nd">@abstractmethod</span> |
| <span class="k">def</span> <span class="nf">_cleanup_and_return</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="k">pass</span> |
| |
| <span class="c1"># TODO: Series support is not implemented yet.</span> |
| <span class="c1"># TODO: not all arguments are implemented comparing to pandas' for now.</span> |
| <span class="k">def</span> <span class="nf">aggregate</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">func_or_funcs</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="sd">"""Aggregate using one or more operations over the specified axis.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func_or_funcs : dict, str or list</span> |
| <span class="sd"> a dict mapping from column name (string) to</span> |
| <span class="sd"> aggregate functions (string or list of strings).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| |
| <span class="sd"> The return can be:</span> |
| |
| <span class="sd"> * Series : when DataFrame.agg is called with a single function</span> |
| <span class="sd"> * DataFrame : when DataFrame.agg is called with several functions</span> |
| |
| <span class="sd"> Return Series or DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> `agg` is an alias for `aggregate`. Use the alias.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 1, 2, 2],</span> |
| <span class="sd"> ... 'B': [1, 2, 3, 4],</span> |
| <span class="sd"> ... 'C': [0.362, 0.227, 1.267, -0.562]},</span> |
| <span class="sd"> ... columns=['A', 'B', 'C'])</span> |
| |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 1 1 0.362</span> |
| <span class="sd"> 1 1 2 0.227</span> |
| <span class="sd"> 2 2 3 1.267</span> |
| <span class="sd"> 3 2 4 -0.562</span> |
| |
| <span class="sd"> Different aggregations per column</span> |
| |
| <span class="sd"> >>> aggregated = df.groupby('A').agg({'B': 'min', 'C': 'sum'})</span> |
| <span class="sd"> >>> aggregated[['B', 'C']].sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1 0.589</span> |
| <span class="sd"> 2 3 0.705</span> |
| |
| <span class="sd"> >>> aggregated = df.groupby('A').agg({'B': ['min', 'max']})</span> |
| <span class="sd"> >>> aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B</span> |
| <span class="sd"> min max</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1 2</span> |
| <span class="sd"> 2 3 4</span> |
| |
| <span class="sd"> >>> aggregated = df.groupby('A').agg('min')</span> |
| <span class="sd"> >>> aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1 0.227</span> |
| <span class="sd"> 2 3 -0.562</span> |
| |
| <span class="sd"> >>> aggregated = df.groupby('A').agg(['min', 'max'])</span> |
| <span class="sd"> >>> aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> min max min max</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1 2 0.227 0.362</span> |
| <span class="sd"> 2 3 4 -0.562 1.267</span> |
| |
| <span class="sd"> To control the output names with different aggregations per column, pandas-on-Spark</span> |
| <span class="sd"> also supports 'named aggregation' or nested renaming in .agg. It can also be</span> |
| <span class="sd"> used when applying multiple aggregation functions to specific columns.</span> |
| |
| <span class="sd"> >>> aggregated = df.groupby('A').agg(b_max=ps.NamedAgg(column='B', aggfunc='max'))</span> |
| <span class="sd"> >>> aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b_max</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 4</span> |
| |
| <span class="sd"> >>> aggregated = df.groupby('A').agg(b_max=('B', 'max'), b_min=('B', 'min'))</span> |
| <span class="sd"> >>> aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b_max b_min</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 2 1</span> |
| <span class="sd"> 2 4 3</span> |
| |
| <span class="sd"> >>> aggregated = df.groupby('A').agg(b_max=('B', 'max'), c_min=('C', 'min'))</span> |
| <span class="sd"> >>> aggregated.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b_max c_min</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 2 0.227</span> |
| <span class="sd"> 2 4 -0.562</span> |
| <span class="sd"> """</span> |
| <span class="c1"># I think current implementation of func and arguments in pandas-on-Spark for aggregate</span> |
| <span class="c1"># is different than pandas, later once arguments are added, this could be removed.</span> |
| <span class="k">if</span> <span class="n">func_or_funcs</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">kwargs</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"No aggregation argument or function specified."</span><span class="p">)</span> |
| |
| <span class="n">relabeling</span> <span class="o">=</span> <span class="n">func_or_funcs</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="n">is_multi_agg_with_relabel</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">relabeling</span><span class="p">:</span> |
| <span class="p">(</span> |
| <span class="n">func_or_funcs</span><span class="p">,</span> |
| <span class="n">columns</span><span class="p">,</span> |
| <span class="n">order</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">=</span> <span class="n">normalize_keyword_aggregation</span><span class="p">(</span> <span class="c1"># type: ignore[assignment]</span> |
| <span class="n">kwargs</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func_or_funcs</span><span class="p">,</span> <span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="nb">list</span><span class="p">)):</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">func_or_funcs</span><span class="p">,</span> <span class="nb">dict</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="nb">all</span><span class="p">(</span> |
| <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> |
| <span class="ow">and</span> <span class="p">(</span> |
| <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> |
| <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> |
| <span class="ow">and</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">value</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">func_or_funcs</span><span class="o">.</span><span class="n">items</span><span class="p">()</span> |
| <span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"aggs must be a dict mapping from column name "</span> |
| <span class="s2">"to aggregate functions (string or list of strings)."</span> |
| <span class="p">)</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">agg_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span><span class="o">.</span><span class="n">name</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">]</span> |
| <span class="n">func_or_funcs</span> <span class="o">=</span> <span class="p">{</span><span class="n">col</span><span class="p">:</span> <span class="n">func_or_funcs</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">agg_cols</span><span class="p">}</span> |
| |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_groupby</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> <span class="n">func_or_funcs</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span> |
| <span class="n">subset</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">:</span> |
| <span class="n">should_drop_index</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span> |
| <span class="n">i</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">gkey</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> <span class="k">if</span> <span class="n">gkey</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">should_drop_index</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">level</span><span class="o">=</span><span class="n">should_drop_index</span><span class="p">,</span> <span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">should_drop_index</span><span class="p">)</span> <span class="o">&lt;</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">):</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span> |
| |
| <span class="k">if</span> <span class="n">relabeling</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">order</span><span class="p">]</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">columns</span> <span class="c1"># type: ignore[assignment]</span> |
| <span class="k">return</span> <span class="n">psdf</span> |
| |
| <span class="n">agg</span> <span class="o">=</span> <span class="n">aggregate</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_spark_groupby</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">func</span><span class="p">:</span> <span class="n">Mapping</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]],</span> |
| <span class="n">groupkeys</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Series</span><span class="p">]</span> <span class="o">=</span> <span class="p">(),</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">InternalFrame</span><span class="p">:</span> |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))]</span> |
| <span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span> |
| |
| <span class="n">multi_aggs</span> <span class="o">=</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="nb">list</span><span class="p">)</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">func</span><span class="o">.</span><span class="n">values</span><span class="p">())</span> |
| <span class="n">reordered</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">data_columns</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="n">func</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="n">label</span> <span class="o">=</span> <span class="n">key</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">key</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">key</span><span class="p">,)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">!=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"The length of the key must be the same as the column label level."</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">aggfunc</span> <span class="ow">in</span> <span class="p">[</span><span class="n">value</span><span class="p">]</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">value</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="n">value</span><span class="p">:</span> |
| <span class="n">column_label</span> <span class="o">=</span> <span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="n">aggfunc</span><span class="p">])</span> <span class="k">if</span> <span class="n">multi_aggs</span> <span class="k">else</span> <span class="n">label</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">column_label</span><span class="p">)</span> |
| |
| <span class="n">data_col</span> <span class="o">=</span> <span class="n">name_like_string</span><span class="p">(</span><span class="n">column_label</span><span class="p">)</span> |
| <span class="n">data_columns</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">data_col</span><span class="p">)</span> |
| |
| <span class="n">col_name</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_name_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">aggfunc</span> <span class="o">==</span> <span class="s2">"nunique"</span><span class="p">:</span> |
| <span class="n">reordered</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span><span class="s2">"count(DISTINCT `</span><span class="si">{0}</span><span class="s2">`) as `</span><span class="si">{1}</span><span class="s2">`"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">col_name</span><span class="p">,</span> <span class="n">data_col</span><span class="p">))</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># Implement "quartiles" aggregate function for ``describe``.</span> |
| <span class="k">elif</span> <span class="n">aggfunc</span> <span class="o">==</span> <span class="s2">"quartiles"</span><span class="p">:</span> |
| <span class="n">reordered</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span> |
| <span class="s2">"percentile_approx(`</span><span class="si">{0}</span><span class="s2">`, array(0.25, 0.5, 0.75)) as `</span><span class="si">{1}</span><span class="s2">`"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">col_name</span><span class="p">,</span> <span class="n">data_col</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">reordered</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">expr</span><span class="p">(</span><span class="s2">"</span><span class="si">{1}</span><span class="s2">(`</span><span class="si">{0}</span><span class="s2">`) as `</span><span class="si">{2}</span><span class="s2">`"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">col_name</span><span class="p">,</span> <span class="n">aggfunc</span><span class="p">,</span> <span class="n">data_col</span><span class="p">))</span> |
| <span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="n">groupkey_scols</span> <span class="o">+</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">reordered</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">groupkeys</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="GroupBy.count"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.count.html#pyspark.pandas.groupby.GroupBy.count">[docs]</a> <span class="k">def</span> <span class="nf">count</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute count of group, excluding missing values.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 1, 2, 1, 2],</span> |
| <span class="sd"> ... 'B': [np.nan, 2, 3, 4, 5],</span> |
| <span class="sd"> ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])</span> |
| <span class="sd"> >>> df.groupby('A').count().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 2 3</span> |
| <span class="sd"> 2 2 2</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div> |
| |
| <span class="c1"># TODO: We should fix See Also when Series implementation is finished.</span> |
| <div class="viewcode-block" id="GroupBy.first"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.first.html#pyspark.pandas.groupby.GroupBy.first">[docs]</a> <span class="k">def</span> <span class="nf">first</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute first of group values.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">first</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.last"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.last.html#pyspark.pandas.groupby.GroupBy.last">[docs]</a> <span class="k">def</span> <span class="nf">last</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute last of group values.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">last</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="n">ignorenulls</span><span class="o">=</span><span class="kc">True</span><span class="p">),</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.max"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.max.html#pyspark.pandas.groupby.GroupBy.max">[docs]</a> <span class="k">def</span> <span class="nf">max</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute max of group values.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div> |
| |
| <span class="c1"># TODO: examples should be updated.</span> |
| <div class="viewcode-block" id="GroupBy.mean"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.mean.html#pyspark.pandas.groupby.GroupBy.mean">[docs]</a> <span class="k">def</span> <span class="nf">mean</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute mean of groups, excluding missing values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> pyspark.pandas.Series or pyspark.pandas.DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 1, 2, 1, 2],</span> |
| <span class="sd"> ... 'B': [np.nan, 2, 3, 4, 5],</span> |
| <span class="sd"> ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])</span> |
| |
| <span class="sd"> Groupby one column and return the mean of the remaining columns in</span> |
| <span class="sd"> each group.</span> |
| |
| <span class="sd"> >>> df.groupby('A').mean().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 3.0 1.333333</span> |
| <span class="sd"> 2 4.0 1.500000</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">mean</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.min"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.min.html#pyspark.pandas.groupby.GroupBy.min">[docs]</a> <span class="k">def</span> <span class="nf">min</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute min of group values.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div> |
| |
| <span class="c1"># TODO: sync the doc.</span> |
| <div class="viewcode-block" id="GroupBy.std"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.std.html#pyspark.pandas.groupby.GroupBy.std">[docs]</a> <span class="k">def</span> <span class="nf">std</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute standard deviation of groups, excluding missing values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> ddof : int, default 1</span> |
| <span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span> |
| <span class="sd"> where N represents the number of elements.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">ddof</span> <span class="ow">in</span> <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">stddev_pop</span> <span class="k">if</span> <span class="n">ddof</span> <span class="o">==</span> <span class="mi">0</span> <span class="k">else</span> <span class="n">F</span><span class="o">.</span><span class="n">stddev_samp</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">True</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.sum"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.sum.html#pyspark.pandas.groupby.GroupBy.sum">[docs]</a> <span class="k">def</span> <span class="nf">sum</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute sum of group values.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">sum</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div> |
| |
| <span class="c1"># TODO: sync the doc.</span> |
| <div class="viewcode-block" id="GroupBy.var"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.var.html#pyspark.pandas.groupby.GroupBy.var">[docs]</a> <span class="k">def</span> <span class="nf">var</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ddof</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute variance of groups, excluding missing values.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> ddof : int, default 1</span> |
| <span class="sd"> Delta Degrees of Freedom. The divisor used in calculations is N - ddof,</span> |
| <span class="sd"> where N represents the number of elements.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="k">assert</span> <span class="n">ddof</span> <span class="ow">in</span> <span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">var_pop</span> <span class="k">if</span> <span class="n">ddof</span> <span class="o">==</span> <span class="mi">0</span> <span class="k">else</span> <span class="n">F</span><span class="o">.</span><span class="n">var_samp</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">True</span> |
| <span class="p">)</span></div> |
| |
| <span class="c1"># TODO: skipna should be implemented.</span> |
| <div class="viewcode-block" id="GroupBy.all"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.all.html#pyspark.pandas.groupby.GroupBy.all">[docs]</a> <span class="k">def</span> <span class="nf">all</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns True if all values in the group are truthy, else False.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],</span> |
| <span class="sd"> ... 'B': [True, True, True, False, False,</span> |
| <span class="sd"> ... False, None, True, None, False]},</span> |
| <span class="sd"> ... columns=['A', 'B'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 True</span> |
| <span class="sd"> 1 1 True</span> |
| <span class="sd"> 2 2 True</span> |
| <span class="sd"> 3 2 False</span> |
| <span class="sd"> 4 3 False</span> |
| <span class="sd"> 5 3 False</span> |
| <span class="sd"> 6 4 None</span> |
| <span class="sd"> 7 4 True</span> |
| <span class="sd"> 8 5 None</span> |
| <span class="sd"> 9 5 False</span> |
| |
| <span class="sd"> >>> df.groupby('A').all().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 True</span> |
| <span class="sd"> 2 False</span> |
| <span class="sd"> 3 False</span> |
| <span class="sd"> 4 True</span> |
| <span class="sd"> 5 False</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">"boolean"</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">))),</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span> |
| <span class="p">)</span></div> |
| |
| <span class="c1"># TODO: skipna should be implemented.</span> |
| <div class="viewcode-block" id="GroupBy.any"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.any.html#pyspark.pandas.groupby.GroupBy.any">[docs]</a> <span class="k">def</span> <span class="nf">any</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Returns True if any value in the group is truthy, else False.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],</span> |
| <span class="sd"> ... 'B': [True, True, True, False, False,</span> |
| <span class="sd"> ... False, None, True, None, False]},</span> |
| <span class="sd"> ... columns=['A', 'B'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 True</span> |
| <span class="sd"> 1 1 True</span> |
| <span class="sd"> 2 2 True</span> |
| <span class="sd"> 3 2 False</span> |
| <span class="sd"> 4 3 False</span> |
| <span class="sd"> 5 3 False</span> |
| <span class="sd"> 6 4 None</span> |
| <span class="sd"> 7 4 True</span> |
| <span class="sd"> 8 5 None</span> |
| <span class="sd"> 9 5 False</span> |
| |
| <span class="sd"> >>> df.groupby('A').any().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 True</span> |
| <span class="sd"> 2 True</span> |
| <span class="sd"> 3 False</span> |
| <span class="sd"> 4 True</span> |
| <span class="sd"> 5 False</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">col</span><span class="p">:</span> <span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">cast</span><span class="p">(</span><span class="s2">"boolean"</span><span class="p">),</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">False</span><span class="p">))),</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span> |
| <span class="p">)</span></div> |
| |
| <span class="c1"># TODO: groupby multiple columns should be implemented.</span> |
| <div class="viewcode-block" id="GroupBy.size"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.size.html#pyspark.pandas.groupby.GroupBy.size">[docs]</a> <span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute group sizes.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'B': [1, 1, 2, 3, 3, 3]},</span> |
| <span class="sd"> ... columns=['A', 'B'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 1</span> |
| <span class="sd"> 1 2 1</span> |
| <span class="sd"> 2 2 2</span> |
| <span class="sd"> 3 3 3</span> |
| <span class="sd"> 4 3 3</span> |
| <span class="sd"> 5 3 3</span> |
| |
| <span class="sd"> >>> df.groupby('A').size().sort_index()</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> >>> df.groupby(['A', 'B']).size().sort_index()</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 1 1 1</span> |
| <span class="sd"> 2 1 1</span> |
| <span class="sd"> 2 1</span> |
| <span class="sd"> 3 3 3</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> For Series,</span> |
| |
| <span class="sd"> >>> df.B.groupby(df.A).size().sort_index()</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| |
| <span class="sd"> >>> df.groupby(df.A).B.size().sort_index()</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="n">groupkeys</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span> |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))]</span> |
| <span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="n">groupkey_scols</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_columns</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">groupkeys</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="kc">None</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"count"</span><span class="p">)],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.diff"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.diff.html#pyspark.pandas.groupby.GroupBy.diff">[docs]</a> <span class="k">def</span> <span class="nf">diff</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> First discrete difference of element.</span> |
| |
| <span class="sd"> Calculates the difference of a DataFrame element compared with another element in the</span> |
| <span class="sd"> DataFrame group (default is the element in the same column of the previous row).</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> periods : int, default 1</span> |
| <span class="sd"> Periods to shift for calculating difference, accepts negative values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> diffed : DataFrame or Series</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 2, 3, 4, 5, 6],</span> |
| <span class="sd"> ... 'b': [1, 1, 2, 3, 5, 8],</span> |
| <span class="sd"> ... 'c': [1, 4, 9, 16, 25, 36]}, columns=['a', 'b', 'c'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 1 1 1</span> |
| <span class="sd"> 1 2 1 4</span> |
| <span class="sd"> 2 3 2 9</span> |
| <span class="sd"> 3 4 3 16</span> |
| <span class="sd"> 4 5 5 25</span> |
| <span class="sd"> 5 6 8 36</span> |
| |
| <span class="sd"> >>> df.groupby(['b']).diff().sort_index()</span> |
| <span class="sd"> a c</span> |
| <span class="sd"> 0 NaN NaN</span> |
| <span class="sd"> 1 1.0 3.0</span> |
| <span class="sd"> 2 NaN NaN</span> |
| <span class="sd"> 3 NaN NaN</span> |
| <span class="sd"> 4 NaN NaN</span> |
| <span class="sd"> 5 NaN NaN</span> |
| |
| <span class="sd"> Difference with previous row in a group, for a single column.</span> |
| |
| <span class="sd"> >>> df.groupby(['b'])['a'].diff().sort_index()</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 1.0</span> |
| <span class="sd"> 2 NaN</span> |
| <span class="sd"> 3 NaN</span> |
| <span class="sd"> 4 NaN</span> |
| <span class="sd"> 5 NaN</span> |
| <span class="sd"> Name: a, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_diff</span><span class="p">(</span><span class="n">periods</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.cumcount"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cumcount.html#pyspark.pandas.groupby.GroupBy.cumcount">[docs]</a> <span class="k">def</span> <span class="nf">cumcount</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Number each item in each group from 0 to the length of that group - 1.</span> |
| |
| <span class="sd"> Essentially this is equivalent to</span> |
| |
| <span class="sd"> .. code-block:: python</span> |
| |
| <span class="sd"> self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> ascending : bool, default True</span> |
| <span class="sd"> If False, number in reverse, from length of group - 1 to 0.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series</span> |
| <span class="sd"> Sequence number of each element within each group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],</span> |
| <span class="sd"> ... columns=['A'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A</span> |
| <span class="sd"> 0 a</span> |
| <span class="sd"> 1 a</span> |
| <span class="sd"> 2 a</span> |
| <span class="sd"> 3 b</span> |
| <span class="sd"> 4 b</span> |
| <span class="sd"> 5 a</span> |
| <span class="sd"> >>> df.groupby('A').cumcount().sort_index()</span> |
| <span class="sd"> 0 0</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 0</span> |
| <span class="sd"> 4 1</span> |
| <span class="sd"> 5 3</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> >>> df.groupby('A').cumcount(ascending=False).sort_index()</span> |
| <span class="sd"> 0 3</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 1</span> |
| <span class="sd"> 3 1</span> |
| <span class="sd"> 4 0</span> |
| <span class="sd"> 5 0</span> |
| <span class="sd"> dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="n">ret</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="o">.</span><span class="n">rename</span><span class="p">()</span> |
| <span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="k">lambda</span> <span class="n">_</span><span class="p">:</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="mi">0</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">ascending</span><span class="o">=</span><span class="n">ascending</span><span class="p">)</span> |
| <span class="o">-</span> <span class="mi">1</span> |
| <span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">ret</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.cummax"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cummax.html#pyspark.pandas.groupby.GroupBy.cummax">[docs]</a> <span class="k">def</span> <span class="nf">cummax</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Cumulative max for each group.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.cummax</span> |
| <span class="sd"> DataFrame.cummax</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame(</span> |
| <span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span> |
| <span class="sd"> ... columns=list('ABC'))</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 1 NaN 4</span> |
| <span class="sd"> 1 1 0.1 3</span> |
| <span class="sd"> 2 1 20.0 2</span> |
| <span class="sd"> 3 4 10.0 1</span> |
| |
| <span class="sd"> By default, iterates over rows and finds the cumulative maximum in each column.</span> |
| |
| <span class="sd"> >>> df.groupby("A").cummax().sort_index()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 NaN 4</span> |
| <span class="sd"> 1 0.1 4</span> |
| <span class="sd"> 2 20.0 4</span> |
| <span class="sd"> 3 10.0 1</span> |
| |
| <span class="sd"> It works as below in Series.</span> |
| |
| <span class="sd"> >>> df.C.groupby(df.A).cummax().sort_index()</span> |
| <span class="sd"> 0 4</span> |
| <span class="sd"> 1 4</span> |
| <span class="sd"> 2 4</span> |
| <span class="sd"> 3 1</span> |
| <span class="sd"> Name: C, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> |
| <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.cummin"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cummin.html#pyspark.pandas.groupby.GroupBy.cummin">[docs]</a> <span class="k">def</span> <span class="nf">cummin</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Cumulative min for each group.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.cummin</span> |
| <span class="sd"> DataFrame.cummin</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame(</span> |
| <span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span> |
| <span class="sd"> ... columns=list('ABC'))</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 1 NaN 4</span> |
| <span class="sd"> 1 1 0.1 3</span> |
| <span class="sd"> 2 1 20.0 2</span> |
| <span class="sd"> 3 4 10.0 1</span> |
| |
| <span class="sd"> By default, iterates over rows and finds the cumulative minimum in each column.</span> |
| |
| <span class="sd"> >>> df.groupby("A").cummin().sort_index()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 NaN 4</span> |
| <span class="sd"> 1 0.1 3</span> |
| <span class="sd"> 2 0.1 2</span> |
| <span class="sd"> 3 10.0 1</span> |
| |
| <span class="sd"> It works as below in Series.</span> |
| |
| <span class="sd"> >>> df.B.groupby(df.A).cummin().sort_index()</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 0.1</span> |
| <span class="sd"> 2 0.1</span> |
| <span class="sd"> 3 10.0</span> |
| <span class="sd"> Name: B, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cum</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">min</span><span class="p">,</span> <span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> |
| <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.cumprod"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cumprod.html#pyspark.pandas.groupby.GroupBy.cumprod">[docs]</a> <span class="k">def</span> <span class="nf">cumprod</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Cumulative product for each group.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.cumprod</span> |
| <span class="sd"> DataFrame.cumprod</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame(</span> |
| <span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span> |
| <span class="sd"> ... columns=list('ABC'))</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 1 NaN 4</span> |
| <span class="sd"> 1 1 0.1 3</span> |
| <span class="sd"> 2 1 20.0 2</span> |
| <span class="sd"> 3 4 10.0 1</span> |
| |
| <span class="sd"> By default, iterates over rows and finds the cumulative product in each column.</span> |
| |
| <span class="sd"> >>> df.groupby("A").cumprod().sort_index()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 NaN 4</span> |
| <span class="sd"> 1 0.1 12</span> |
| <span class="sd"> 2 2.0 24</span> |
| <span class="sd"> 3 10.0 1</span> |
| |
| <span class="sd"> It works as below in Series.</span> |
| |
| <span class="sd"> >>> df.B.groupby(df.A).cumprod().sort_index()</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 0.1</span> |
| <span class="sd"> 2 2.0</span> |
| <span class="sd"> 3 10.0</span> |
| <span class="sd"> Name: B, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cumprod</span><span class="p">(</span><span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> |
| <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.cumsum"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.cumsum.html#pyspark.pandas.groupby.GroupBy.cumsum">[docs]</a> <span class="k">def</span> <span class="nf">cumsum</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Cumulative sum for each group.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.cumsum</span> |
| <span class="sd"> DataFrame.cumsum</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame(</span> |
| <span class="sd"> ... [[1, None, 4], [1, 0.1, 3], [1, 20.0, 2], [4, 10.0, 1]],</span> |
| <span class="sd"> ... columns=list('ABC'))</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 1 NaN 4</span> |
| <span class="sd"> 1 1 0.1 3</span> |
| <span class="sd"> 2 1 20.0 2</span> |
| <span class="sd"> 3 4 10.0 1</span> |
| |
| <span class="sd"> By default, iterates over rows and finds the sum in each column.</span> |
| |
| <span class="sd"> >>> df.groupby("A").cumsum().sort_index()</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 NaN 4</span> |
| <span class="sd"> 1 0.1 7</span> |
| <span class="sd"> 2 20.1 9</span> |
| <span class="sd"> 3 10.0 1</span> |
| |
| <span class="sd"> It works as below in Series.</span> |
| |
| <span class="sd"> >>> df.B.groupby(df.A).cumsum().sort_index()</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 0.1</span> |
| <span class="sd"> 2 20.1</span> |
| <span class="sd"> 3 10.0</span> |
| <span class="sd"> Name: B, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_cumsum</span><span class="p">(</span><span class="kc">True</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> |
| <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.apply"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.apply.html#pyspark.pandas.groupby.GroupBy.apply">[docs]</a> <span class="k">def</span> <span class="nf">apply</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">Series</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Apply function `func` group-wise and combine the results together.</span> |
| |
| <span class="sd"> The function passed to `apply` must take a DataFrame as its first</span> |
| <span class="sd"> argument and return a DataFrame. `apply` will</span> |
| <span class="sd"> then take care of combining the results back together into a single</span> |
| <span class="sd"> dataframe. `apply` is therefore a highly flexible</span> |
| <span class="sd"> grouping method.</span> |
| |
| <span class="sd"> While `apply` is a very flexible method, its downside is that</span> |
| <span class="sd"> using it can be quite a bit slower than using more specific methods</span> |
| <span class="sd"> like `agg` or `transform`. pandas-on-Spark offers a wide range of methods that will</span> |
| <span class="sd"> be much faster than using `apply` for their specific purposes, so try to</span> |
| <span class="sd"> use them before reaching for `apply`.</span> |
| |
| <span class="sd"> .. note:: this API executes the function once to infer the type which is</span> |
| <span class="sd"> potentially expensive, for instance, when the dataset is created after</span> |
| <span class="sd"> aggregations or sorting.</span> |
| |
| <span class="sd"> To avoid this, specify return type in ``func``, for instance, as below:</span> |
| |
| <span class="sd"> >>> def pandas_div(x) -> ps.DataFrame[int, [float, float]]:</span> |
| <span class="sd"> ... return x[['B', 'C']] / x[['B', 'C']]</span> |
| |
| <span class="sd"> If the return type is specified, the output column names become</span> |
| <span class="sd"> `c0, c1, c2 ... cn`. These names are positionally mapped to the returned</span> |
| <span class="sd"> DataFrame in ``func``.</span> |
| |
| <span class="sd"> To specify the column names, you can assign them in a NumPy compound type style</span> |
| <span class="sd"> as below:</span> |
| |
| <span class="sd"> >>> def pandas_div(x) -> ps.DataFrame[("index", int), [("a", float), ("b", float)]]:</span> |
| <span class="sd"> ... return x[['B', 'C']] / x[['B', 'C']]</span> |
| |
| <span class="sd"> >>> pdf = pd.DataFrame({'B': [1.], 'C': [3.]})</span> |
| <span class="sd"> >>> def plus_one(x) -> ps.DataFrame[</span> |
| <span class="sd"> ... (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:</span> |
| <span class="sd"> ... return x[['B', 'C']] / x[['B', 'C']]</span> |
| |
| <span class="sd"> .. note:: the dataframe within ``func`` is actually a pandas dataframe. Therefore,</span> |
| <span class="sd"> any pandas API within this function is allowed.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : callable</span> |
| <span class="sd"> A callable that takes a DataFrame as its first argument, and</span> |
| <span class="sd"> returns a dataframe.</span> |
| <span class="sd"> *args</span> |
| <span class="sd"> Positional arguments to pass to func.</span> |
| <span class="sd"> **kwargs</span> |
| <span class="sd"> Keyword arguments to pass to func.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> applied : DataFrame or Series</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> aggregate : Apply aggregate function to the GroupBy object.</span> |
| <span class="sd"> DataFrame.apply : Apply a function to a DataFrame.</span> |
| <span class="sd"> Series.apply : Apply a function to a Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': 'a a b'.split(),</span> |
| <span class="sd"> ... 'B': [1, 2, 3],</span> |
| <span class="sd"> ... 'C': [4, 6, 5]}, columns=['A', 'B', 'C'])</span> |
| <span class="sd"> >>> g = df.groupby('A')</span> |
| |
| <span class="sd"> Notice that ``g`` has two groups, ``a`` and ``b``.</span> |
| <span class="sd"> Calling `apply` in various ways, we can get different grouping results:</span> |
| |
| <span class="sd"> Below, the function passed to `apply` takes a DataFrame as</span> |
| <span class="sd"> its argument and returns a DataFrame. `apply` combines the result for</span> |
| <span class="sd"> each group together into a new DataFrame:</span> |
| |
| <span class="sd"> >>> def plus_min(x):</span> |
| <span class="sd"> ... return x + x.min()</span> |
| <span class="sd"> >>> g.apply(plus_min).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 0 aa 2 8</span> |
| <span class="sd"> 1 aa 3 10</span> |
| <span class="sd"> 2 bb 6 10</span> |
| |
| <span class="sd"> >>> g.apply(sum).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> A</span> |
| <span class="sd"> a aa 3 10</span> |
| <span class="sd"> b b 3 5</span> |
| |
| <span class="sd"> >>> g.apply(len).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> A</span> |
| <span class="sd"> a 2</span> |
| <span class="sd"> b 1</span> |
| <span class="sd"> dtype: int64</span> |
| |
| <span class="sd"> You can specify the type hint and prevent schema inference for better performance.</span> |
| |
| <span class="sd"> >>> def pandas_div(x) -> ps.DataFrame[int, [float, float]]:</span> |
| <span class="sd"> ... return x[['B', 'C']] / x[['B', 'C']]</span> |
| <span class="sd"> >>> g.apply(pandas_div).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> c0 c1</span> |
| <span class="sd"> 0 1.0 1.0</span> |
| <span class="sd"> 1 1.0 1.0</span> |
| <span class="sd"> 2 1.0 1.0</span> |
| |
| <span class="sd"> >>> def pandas_div(x) -> ps.DataFrame[("index", int), [("f1", float), ("f2", float)]]:</span> |
| <span class="sd"> ... return x[['B', 'C']] / x[['B', 'C']]</span> |
| <span class="sd"> >>> g.apply(pandas_div).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> f1 f2</span> |
| <span class="sd"> index</span> |
| <span class="sd"> 0 1.0 1.0</span> |
| <span class="sd"> 1 1.0 1.0</span> |
| <span class="sd"> 2 1.0 1.0</span> |
| |
| <span class="sd"> In case of Series, it works as below.</span> |
| |
| <span class="sd"> >>> def plus_max(x) -> ps.Series[np.int]:</span> |
| <span class="sd"> ... return x + x.max()</span> |
| <span class="sd"> >>> df.B.groupby(df.A).apply(plus_max).sort_index() # doctest: +SKIP</span> |
| <span class="sd"> 0 6</span> |
| <span class="sd"> 1 3</span> |
| <span class="sd"> 2 4</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| |
| <span class="sd"> >>> def plus_min(x):</span> |
| <span class="sd"> ... return x + x.min()</span> |
| <span class="sd"> >>> df.B.groupby(df.A).apply(plus_min).sort_index()</span> |
| <span class="sd"> 0 2</span> |
| <span class="sd"> 1 3</span> |
| <span class="sd"> 2 6</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| |
| <span class="sd"> You can also return a scalar value as an aggregated value of the group:</span> |
| |
| <span class="sd"> >>> def plus_length(x) -> np.int:</span> |
| <span class="sd"> ... return len(x)</span> |
| <span class="sd"> >>> df.B.groupby(df.A).apply(plus_length).sort_index() # doctest: +SKIP</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| |
| <span class="sd"> The extra arguments to the function can be passed as below.</span> |
| |
| <span class="sd"> >>> def calculation(x, y, z) -> np.int:</span> |
| <span class="sd"> ... return len(x) + y * z</span> |
| <span class="sd"> >>> df.B.groupby(df.A).apply(calculation, 5, z=10).sort_index() # doctest: +SKIP</span> |
| <span class="sd"> 0 51</span> |
| <span class="sd"> 1 52</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">callable</span><span class="p">(</span><span class="n">func</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"</span><span class="si">%s</span><span class="s2"> object is not callable"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| |
| <span class="n">spec</span> <span class="o">=</span> <span class="n">inspect</span><span class="o">.</span><span class="n">getfullargspec</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> |
| <span class="n">return_sig</span> <span class="o">=</span> <span class="n">spec</span><span class="o">.</span><span class="n">annotations</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"return"</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> |
| <span class="n">should_infer_schema</span> <span class="o">=</span> <span class="n">return_sig</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="n">should_retain_index</span> <span class="o">=</span> <span class="n">should_infer_schema</span> |
| |
| <span class="n">is_series_groupby</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">SeriesGroupBy</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> |
| |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span> |
| <span class="p">]</span> |
| |
| <span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">agg_columns</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]</span> |
| <span class="n">pandas_apply</span> <span class="o">=</span> <span class="n">_builtin_table</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">func</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">f</span> <span class="o">=</span> <span class="n">_builtin_table</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">func</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">pandas_apply</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">,</span> <span class="o">*</span><span class="n">a</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">k</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">f</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">),</span> <span class="o">*</span><span class="n">a</span><span class="p">,</span> <span class="o">**</span><span class="n">k</span><span class="p">)</span> |
| |
| <span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">False</span> |
| |
| <span class="k">if</span> <span class="n">should_infer_schema</span><span class="p">:</span> |
| <span class="c1"># Here we execute with the first 1000 to get the return type.</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"If the type hints is not specified for `grouby.apply`, "</span> |
| <span class="s2">"it is expensive to infer the data type internally."</span> |
| <span class="p">)</span> |
| <span class="n">limit</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"compute.shortcut_limit"</span><span class="p">)</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">limit</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span> |
| <span class="n">groupkeys</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">pdf</span><span class="p">[</span><span class="n">groupkey_name</span><span class="p">]</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">groupkey_name</span><span class="p">,</span> <span class="n">psser</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">grouped</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| <span class="n">pser_or_pdf</span> <span class="o">=</span> <span class="n">grouped</span><span class="p">[</span><span class="n">name</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">pser_or_pdf</span> <span class="o">=</span> <span class="n">grouped</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="n">psser_or_psdf</span> <span class="o">=</span> <span class="n">ps</span><span class="o">.</span><span class="n">from_pandas</span><span class="p">(</span><span class="n">pser_or_pdf</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o"><=</span> <span class="n">limit</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser_or_psdf</span><span class="p">,</span> <span class="n">ps</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| <span class="n">psser_or_psdf</span> <span class="o">=</span> <span class="n">psser_or_psdf</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">SeriesGroupBy</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">],</span> <span class="n">psser_or_psdf</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">grouped</span><span class="p">)</span> <span class="o"><=</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">with</span> <span class="n">warnings</span><span class="o">.</span><span class="n">catch_warnings</span><span class="p">():</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">simplefilter</span><span class="p">(</span><span class="s2">"always"</span><span class="p">)</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"The amount of data for return type inference might not be large enough. "</span> |
| <span class="s2">"Consider increasing an option `compute.shortcut_limit`."</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser_or_psdf</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="n">psdf_from_pandas</span> <span class="o">=</span> <span class="n">psser_or_psdf</span><span class="o">.</span><span class="n">_psdf</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">psdf_from_pandas</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">psser_or_psdf</span><span class="p">)</span> |
| |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">normalize_spark_type</span><span class="p">()</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span> |
| <span class="p">]</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">normalize_spark_type</span><span class="p">()</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span> |
| <span class="p">]</span> |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">index_fields</span> <span class="o">+</span> <span class="n">data_fields</span><span class="p">])</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">return_type</span> <span class="o">=</span> <span class="n">infer_return_type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_series_groupby</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">return_type</span><span class="p">,</span> <span class="n">SeriesType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Series as a return type hint at frame groupby is not supported "</span> |
| <span class="s2">"currently; however got [</span><span class="si">%s</span><span class="s2">]. Use DataFrame type hint instead."</span> <span class="o">%</span> <span class="n">return_sig</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">return_type</span><span class="p">,</span> <span class="n">DataFrameType</span><span class="p">):</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">data_fields</span> |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">spark_type</span> |
| <span class="n">index_fields</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">index_fields</span> |
| <span class="n">should_retain_index</span> <span class="o">=</span> <span class="nb">len</span><span class="p">(</span><span class="n">index_fields</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span> |
| <span class="n">psdf_from_pandas</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">should_return_series</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="n">dtype</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">SeriesType</span><span class="p">,</span> <span class="n">ScalarType</span><span class="p">],</span> <span class="n">return_type</span><span class="p">)</span><span class="o">.</span><span class="n">dtype</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">SeriesType</span><span class="p">,</span> <span class="n">ScalarType</span><span class="p">],</span> <span class="n">return_type</span><span class="p">)</span><span class="o">.</span><span class="n">spark_type</span> |
| <span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">InternalField</span><span class="p">(</span> |
| <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">struct_field</span><span class="o">=</span><span class="n">StructField</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">,</span> <span class="n">dataType</span><span class="o">=</span><span class="n">spark_type</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">InternalField</span><span class="p">(</span> |
| <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> |
| <span class="n">struct_field</span><span class="o">=</span><span class="n">StructField</span><span class="p">(</span> |
| <span class="n">name</span><span class="o">=</span><span class="n">SPARK_DEFAULT_SERIES_NAME</span><span class="p">,</span> <span class="n">dataType</span><span class="o">=</span><span class="n">spark_type</span> |
| <span class="p">),</span> |
| <span class="p">)</span> |
| <span class="p">]</span> |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">data_fields</span><span class="p">])</span> |
| |
| <span class="k">def</span> <span class="nf">pandas_groupby_apply</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| |
| <span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| <span class="n">pdf_or_ser</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)[</span><span class="n">name</span><span class="p">]</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">pdf_or_ser</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">apply</span><span class="p">(</span><span class="n">pandas_apply</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">should_return_series</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_ser</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="n">pdf_or_ser</span> <span class="o">=</span> <span class="n">pdf_or_ser</span><span class="o">.</span><span class="n">stack</span><span class="p">()</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">pdf_or_ser</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf_or_ser</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pdf_or_ser</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">pandas_groupby_apply</span><span class="p">,</span> |
| <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span> |
| <span class="n">return_schema</span><span class="p">,</span> |
| <span class="n">retain_index</span><span class="o">=</span><span class="n">should_retain_index</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">should_retain_index</span><span class="p">:</span> |
| <span class="c1"># If schema is inferred, we can restore indexes too.</span> |
| <span class="k">if</span> <span class="n">psdf_from_pandas</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">index_names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="o">...</span><span class="p">]]]]</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <span class="n">index_spark_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_field</span><span class="o">.</span><span class="n">struct_field</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">index_field</span> <span class="ow">in</span> <span class="n">index_fields</span> |
| <span class="p">]</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">any</span><span class="p">(</span> |
| <span class="p">[</span> |
| <span class="n">SPARK_INDEX_NAME_PATTERN</span><span class="o">.</span><span class="n">match</span><span class="p">(</span><span class="n">index_field</span><span class="o">.</span><span class="n">struct_field</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">index_field</span> <span class="ow">in</span> <span class="n">index_fields</span> |
| <span class="p">]</span> |
| <span class="p">):</span> |
| <span class="n">index_names</span> <span class="o">=</span> <span class="p">[(</span><span class="n">index_field</span><span class="o">.</span><span class="n">struct_field</span><span class="o">.</span><span class="n">name</span><span class="p">,)</span> <span class="k">for</span> <span class="n">index_field</span> <span class="ow">in</span> <span class="n">index_fields</span><span class="p">]</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="n">index_names</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="n">index_spark_columns</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="n">index_fields</span><span class="p">,</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="c1"># Otherwise, it loses index.</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">should_return_series</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">cast</span><span class="p">(</span><span class="n">SeriesGroupBy</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">psser</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span></div> |
| |
| <span class="c1"># TODO: implement 'dropna' parameter</span> |
| <div class="viewcode-block" id="GroupBy.filter"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.filter.html#pyspark.pandas.groupby.GroupBy.filter">[docs]</a> <span class="k">def</span> <span class="nf">filter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">FrameLike</span><span class="p">],</span> <span class="n">FrameLike</span><span class="p">])</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a copy of a DataFrame excluding elements from groups that</span> |
| <span class="sd"> do not satisfy the boolean criterion specified by func.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : function</span> |
| <span class="sd"> Function to apply to each subframe. Should return True or False.</span> |
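| <span class="sd"> dropna : Drop groups that do not pass the filter. True by default;</span> |
| <span class="sd"> if False, groups that evaluate False are filled with NaNs.</span> |
| <span class="sd"> Note: this parameter is not implemented yet; only ``func`` is accepted.</span> |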
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> filtered : DataFrame or Series</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Each subframe is endowed with the attribute 'name' in case you need to know</span> |
| <span class="sd"> which group you are working on.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',</span> |
| <span class="sd"> ... 'foo', 'bar'],</span> |
| <span class="sd"> ... 'B' : [1, 2, 3, 4, 5, 6],</span> |
| <span class="sd"> ... 'C' : [2.0, 5., 8., 1., 2., 9.]}, columns=['A', 'B', 'C'])</span> |
| <span class="sd"> >>> grouped = df.groupby('A')</span> |
| <span class="sd"> >>> grouped.filter(lambda x: x['B'].mean() > 3.)</span> |
| <span class="sd"> A B C</span> |
| <span class="sd"> 1 bar 2 5.0</span> |
| <span class="sd"> 3 bar 4 1.0</span> |
| <span class="sd"> 5 bar 6 9.0</span> |
| |
| <span class="sd"> >>> df.B.groupby(df.A).filter(lambda x: x.mean() > 3.)</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> 5 6</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">callable</span><span class="p">(</span><span class="n">func</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"</span><span class="si">%s</span><span class="s2"> object is not callable"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| |
| <span class="n">is_series_groupby</span> <span class="o">=</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">SeriesGroupBy</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> |
| |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span> |
| <span class="p">]</span> |
| |
| <span class="n">data_schema</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">psdf</span><span class="p">[</span><span class="n">agg_columns</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span><span class="o">.</span><span class="n">schema</span> |
| <span class="p">)</span> |
| |
| <span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">agg_columns</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| |
| <span class="k">def</span> <span class="nf">pandas_filter</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)[</span><span class="n">pdf</span><span class="o">.</span><span class="n">columns</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">]]</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">func</span><span class="p">))</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">f</span> <span class="o">=</span> <span class="n">_builtin_table</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="n">func</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">wrapped_func</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">f</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">))</span> |
| |
| <span class="k">def</span> <span class="nf">pandas_filter</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">wrapped_func</span><span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">pandas_filter</span><span class="p">,</span> |
| <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span> |
| <span class="n">data_schema</span><span class="p">,</span> |
| <span class="n">retain_index</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">[</span><span class="n">agg_columns</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">is_series_groupby</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">FrameLike</span><span class="p">,</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">FrameLike</span><span class="p">,</span> <span class="n">psdf</span><span class="p">)</span></div> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_prepare_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">groupkeys</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> <span class="n">agg_columns</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]:</span> |
| <span class="n">groupkey_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="s2">"__groupkey_</span><span class="si">{}</span><span class="s2">__"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))</span> |
| <span class="p">]</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[[</span><span class="n">s</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">)]</span> <span class="o">+</span> <span class="n">agg_columns</span><span class="p">]</span> |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">label</span> <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span> <span class="k">else</span> <span class="n">label</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">]</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span><span class="p">),</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_spark_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> |
| <span class="n">groupkeys_scols</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> |
| <span class="n">return_schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">,</span> |
| <span class="n">retain_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">SparkDataFrame</span><span class="p">:</span> |
| <span class="n">output_func</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_make_pandas_df_builder_func</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">func</span><span class="p">,</span> <span class="n">return_schema</span><span class="p">,</span> <span class="n">retain_index</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkeys_scols</span><span class="p">)</span><span class="o">.</span><span class="n">applyInPandas</span><span class="p">(</span><span class="n">output_func</span><span class="p">,</span> <span class="n">return_schema</span><span class="p">)</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_make_pandas_df_builder_func</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> |
| <span class="n">return_schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">,</span> |
| <span class="n">retain_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Callable</span><span class="p">[[</span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">],</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Creates a function that can be used inside the pandas UDF. This function can construct</span> |
| <span class="sd"> the same pandas DataFrame as if the pandas-on-Spark DataFrame is collected to driver side.</span> |
| <span class="sd"> The index, column labels, etc. are re-constructed within the function.</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.utils</span> <span class="kn">import</span> <span class="n">is_timestamp_ntz_preferred</span> |
| |
| <span class="n">arguments_for_restore_index</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">arguments_for_restore_index</span> |
| <span class="n">prefer_timestamp_ntz</span> <span class="o">=</span> <span class="n">is_timestamp_ntz_preferred</span><span class="p">()</span> |
| |
| <span class="k">def</span> <span class="nf">rename_output</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">restore_index</span><span class="p">(</span><span class="n">pdf</span><span class="o">.</span><span class="n">copy</span><span class="p">(),</span> <span class="o">**</span><span class="n">arguments_for_restore_index</span><span class="p">)</span> |
| |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">func</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> |
| |
| <span class="c1"># If the schema should be inferred, we don't restore the index. pandas seems to restore</span> |
| <span class="c1"># the index in some cases.</span> |
| <span class="c1"># When Spark output type is specified, without executing it, we don't know</span> |
| <span class="c1"># if we should restore the index or not. For instance, see the example in</span> |
| <span class="c1"># https://github.com/databricks/koalas/issues/628.</span> |
| <span class="n">pdf</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="o">.</span><span class="n">prepare_pandas_frame</span><span class="p">(</span> |
| <span class="n">pdf</span><span class="p">,</span> <span class="n">retain_index</span><span class="o">=</span><span class="n">retain_index</span><span class="p">,</span> <span class="n">prefer_timestamp_ntz</span><span class="o">=</span><span class="n">prefer_timestamp_ntz</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># Just positionally map the column names to given schema's.</span> |
| <span class="n">pdf</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">return_schema</span><span class="o">.</span><span class="n">names</span> |
| |
| <span class="k">return</span> <span class="n">pdf</span> |
| |
| <span class="k">return</span> <span class="n">rename_output</span> |
| |
| <div class="viewcode-block" id="GroupBy.rank"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.rank.html#pyspark.pandas.groupby.GroupBy.rank">[docs]</a> <span class="k">def</span> <span class="nf">rank</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"average"</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Provide the rank of values within each group.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'</span> |
| <span class="sd"> * average: average rank of group</span> |
| <span class="sd"> * min: lowest rank in group</span> |
| <span class="sd"> * max: highest rank in group</span> |
| <span class="sd"> * first: ranks assigned in order they appear in the array</span> |
| <span class="sd"> * dense: like 'min', but rank always increases by 1 between groups</span> |
| <span class="sd"> ascending : boolean, default True</span> |
| <span class="sd"> False for ranks by high (1) to low (N)</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame with ranking of values within each group</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 1</span> |
| <span class="sd"> 1 1 2</span> |
| <span class="sd"> 2 1 2</span> |
| <span class="sd"> 3 2 2</span> |
| <span class="sd"> 4 2 3</span> |
| <span class="sd"> 5 2 3</span> |
| <span class="sd"> 6 3 3</span> |
| <span class="sd"> 7 3 4</span> |
| <span class="sd"> 8 3 4</span> |
| |
| <span class="sd"> >>> df.groupby("a").rank().sort_index()</span> |
| <span class="sd"> b</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 2.5</span> |
| <span class="sd"> 2 2.5</span> |
| <span class="sd"> 3 1.0</span> |
| <span class="sd"> 4 2.5</span> |
| <span class="sd"> 5 2.5</span> |
| <span class="sd"> 6 1.0</span> |
| <span class="sd"> 7 2.5</span> |
| <span class="sd"> 8 2.5</span> |
| |
| <span class="sd"> >>> df.b.groupby(df.a).rank(method='max').sort_index()</span> |
| <span class="sd"> 0 1.0</span> |
| <span class="sd"> 1 3.0</span> |
| <span class="sd"> 2 3.0</span> |
| <span class="sd"> 3 1.0</span> |
| <span class="sd"> 4 3.0</span> |
| <span class="sd"> 5 3.0</span> |
| <span class="sd"> 6 1.0</span> |
| <span class="sd"> 7 3.0</span> |
| <span class="sd"> 8 3.0</span> |
| <span class="sd"> Name: b, dtype: float64</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_rank</span><span class="p">(</span><span class="n">method</span><span class="p">,</span> <span class="n">ascending</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> |
| <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <span class="c1"># TODO: add axis parameter</span> |
| <div class="viewcode-block" id="GroupBy.idxmax"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.idxmax.html#pyspark.pandas.groupby.GroupBy.idxmax">[docs]</a> <span class="k">def</span> <span class="nf">idxmax</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return index of first occurrence of maximum over requested axis in group.</span> |
| <span class="sd"> NA/null values are excluded.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> skipna : boolean, default True</span> |
| <span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.idxmax</span> |
| <span class="sd"> DataFrame.idxmax</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 2, 2, 3],</span> |
| <span class="sd"> ... 'b': [1, 2, 3, 4, 5],</span> |
| <span class="sd"> ... 'c': [5, 4, 3, 2, 1]}, columns=['a', 'b', 'c'])</span> |
| |
| <span class="sd"> >>> df.groupby(['a'])['b'].idxmax().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 1</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> Name: b, dtype: int64</span> |
| |
| <span class="sd"> >>> df.groupby(['a']).idxmax().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b c</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 1 0</span> |
| <span class="sd"> 2 3 2</span> |
| <span class="sd"> 3 4 4</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"idxmax only support one-level index now"</span><span class="p">)</span> |
| |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"__groupkey_</span><span class="si">{}</span><span class="s2">__"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">):</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| <span class="n">index</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| <span class="n">stat_exprs</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_scols</span><span class="p">):</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="n">skipna</span><span class="p">:</span> |
| <span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">desc_nulls_last</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">desc_nulls_first</span><span class="p">()</span> |
| |
| <span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span> |
| <span class="n">order_column</span><span class="p">,</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">name</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">,</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">stat_exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">))</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">stat_exprs</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cleanup_and_return</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <span class="c1"># TODO: add axis parameter</span> |
| <div class="viewcode-block" id="GroupBy.idxmin"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.idxmin.html#pyspark.pandas.groupby.GroupBy.idxmin">[docs]</a> <span class="k">def</span> <span class="nf">idxmin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">skipna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return index of first occurrence of minimum in each group.</span> |
| <span class="sd"> NA/null values are excluded by default (see ``skipna``).</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> skipna : boolean, default True</span> |
| <span class="sd"> Exclude NA/null values. If an entire row/column is NA, the result will be NA.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.idxmin</span> |
| <span class="sd"> DataFrame.idxmin</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 2, 2, 3],</span> |
| <span class="sd"> ... 'b': [1, 2, 3, 4, 5],</span> |
| <span class="sd"> ... 'c': [5, 4, 3, 2, 1]}, columns=['a', 'b', 'c'])</span> |
| |
| <span class="sd"> >>> df.groupby(['a'])['b'].idxmin().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 0</span> |
| <span class="sd"> 2 2</span> |
| <span class="sd"> 3 4</span> |
| <span class="sd"> Name: b, dtype: int64</span> |
| |
| <span class="sd"> >>> df.groupby(['a']).idxmin().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b c</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 0 1</span> |
| <span class="sd"> 2 2 3</span> |
| <span class="sd"> 3 4 4</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">!=</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"idxmin only support one-level index now"</span><span class="p">)</span> |
| |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"__groupkey_</span><span class="si">{}</span><span class="s2">__"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">):</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span> |
| <span class="n">index</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| <span class="n">stat_exprs</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_scols</span><span class="p">):</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| |
| <span class="k">if</span> <span class="n">skipna</span><span class="p">:</span> |
| <span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">asc_nulls_last</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">order_column</span> <span class="o">=</span> <span class="n">scol</span><span class="o">.</span><span class="n">asc_nulls_first</span><span class="p">()</span> |
| |
| <span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span> |
| <span class="n">order_column</span><span class="p">,</span> <span class="n">NATURAL_ORDER_COLUMN_NAME</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">name</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">,</span> <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index</span><span class="p">))</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">stat_exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">max</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">name</span><span class="p">))</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">))</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">stat_exprs</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cleanup_and_return</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.fillna"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.fillna.html#pyspark.pandas.groupby.GroupBy.fillna">[docs]</a> <span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">method</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">axis</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Axis</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">inplace</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""Fill NA/NaN values in group.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> value : scalar, dict, Series</span> |
| <span class="sd"> Value to use to fill holes. Alternatively, a dict/Series of values</span> |
| <span class="sd"> specifying which value to use for each column.</span> |
| <span class="sd"> DataFrame is not supported.</span> |
| <span class="sd"> method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None</span> |
| <span class="sd"> Method to use for filling holes in reindexed Series.</span> |
| <span class="sd"> pad / ffill: propagate last valid observation forward to next valid.</span> |
| <span class="sd"> backfill / bfill: use next valid observation to fill gap.</span> |
| <span class="sd"> axis : {0 or `index`}</span> |
| <span class="sd"> 1 and `columns` are not supported.</span> |
| <span class="sd"> inplace : boolean, default False</span> |
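| <span class="sd"> Fill in place (do not create a new object).</span> |
| <span class="sd"> Note: this method does not pass `inplace` on to the underlying fill, so it currently has no effect.</span> |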
| <span class="sd"> limit : int, default None</span> |
| <span class="sd"> If method is specified, this is the maximum number of consecutive NaN values to</span> |
| <span class="sd"> forward/backward fill. In other words, if there is a gap with more than this number of</span> |
| <span class="sd"> consecutive NaNs, it will only be partially filled. If method is not specified,</span> |
| <span class="sd"> this is the maximum number of entries along the entire axis where NaNs will be filled.</span> |
| <span class="sd"> Must be greater than 0 if not None</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or Series</span> |
| <span class="sd"> DataFrame or Series with NA entries filled.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'A': [1, 1, 2, 2],</span> |
| <span class="sd"> ... 'B': [2, 4, None, 3],</span> |
| <span class="sd"> ... 'C': [None, None, None, 1],</span> |
| <span class="sd"> ... 'D': [0, 1, 5, 4]</span> |
| <span class="sd"> ... },</span> |
| <span class="sd"> ... columns=['A', 'B', 'C', 'D'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 1 2.0 NaN 0</span> |
| <span class="sd"> 1 1 4.0 NaN 1</span> |
| <span class="sd"> 2 2 NaN NaN 5</span> |
| <span class="sd"> 3 2 3.0 1.0 4</span> |
| |
| <span class="sd"> We can also propagate non-null values forward or backward in group.</span> |
| |
| <span class="sd"> >>> df.groupby(['A'])['B'].fillna(method='ffill').sort_index()</span> |
| <span class="sd"> 0 2.0</span> |
| <span class="sd"> 1 4.0</span> |
| <span class="sd"> 2 NaN</span> |
| <span class="sd"> 3 3.0</span> |
| <span class="sd"> Name: B, dtype: float64</span> |
| |
| <span class="sd"> >>> df.groupby(['A']).fillna(method='bfill').sort_index()</span> |
| <span class="sd"> B C D</span> |
| <span class="sd"> 0 2.0 NaN 0</span> |
| <span class="sd"> 1 4.0 NaN 1</span> |
| <span class="sd"> 2 3.0 1.0 5</span> |
| <span class="sd"> 3 3.0 1.0 4</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_fillna</span><span class="p">(</span> |
| <span class="n">value</span><span class="o">=</span><span class="n">value</span><span class="p">,</span> <span class="n">method</span><span class="o">=</span><span class="n">method</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="n">axis</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span> |
| <span class="p">),</span> |
| <span class="n">should_resolve</span><span class="o">=</span><span class="p">(</span><span class="n">method</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">),</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.bfill"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.bfill.html#pyspark.pandas.groupby.GroupBy.bfill">[docs]</a> <span class="k">def</span> <span class="nf">bfill</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Synonym for `GroupBy.fillna()` with ``method='bfill'``.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> limit : int, default None</span> |
| <span class="sd"> Maximum number of consecutive NaN values to backward fill. In other words, if there is</span> |
| <span class="sd"> a gap with more than this number of consecutive NaNs, it will only be partially filled.</span> |
| <span class="sd"> Must be greater than 0 if not None</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> DataFrame with NA entries filled.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'A': [1, 1, 2, 2],</span> |
| <span class="sd"> ... 'B': [2, 4, None, 3],</span> |
| <span class="sd"> ... 'C': [None, None, None, 1],</span> |
| <span class="sd"> ... 'D': [0, 1, 5, 4]</span> |
| <span class="sd"> ... },</span> |
| <span class="sd"> ... columns=['A', 'B', 'C', 'D'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 1 2.0 NaN 0</span> |
| <span class="sd"> 1 1 4.0 NaN 1</span> |
| <span class="sd"> 2 2 NaN NaN 5</span> |
| <span class="sd"> 3 2 3.0 1.0 4</span> |
| |
| <span class="sd"> Propagate non-null values backward.</span> |
| |
| <span class="sd"> >>> df.groupby(['A']).bfill().sort_index()</span> |
| <span class="sd"> B C D</span> |
| <span class="sd"> 0 2.0 NaN 0</span> |
| <span class="sd"> 1 4.0 NaN 1</span> |
| <span class="sd"> 2 3.0 1.0 5</span> |
| <span class="sd"> 3 3.0 1.0 4</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">"bfill"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span></div> |
| |
| <span class="n">backfill</span> <span class="o">=</span> <span class="n">bfill</span> |
| |
| <div class="viewcode-block" id="GroupBy.ffill"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.ffill.html#pyspark.pandas.groupby.GroupBy.ffill">[docs]</a> <span class="k">def</span> <span class="nf">ffill</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">limit</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Synonym for `GroupBy.fillna()` with ``method='ffill'``.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> limit : int, default None</span> |
| <span class="sd"> Maximum number of consecutive NaN values to forward fill. In other words, if there is</span> |
| <span class="sd"> a gap with more than this number of consecutive NaNs, it will only be partially filled.</span> |
| <span class="sd"> Must be greater than 0 if not None</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> DataFrame with NA entries filled.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'A': [1, 1, 2, 2],</span> |
| <span class="sd"> ... 'B': [2, 4, None, 3],</span> |
| <span class="sd"> ... 'C': [None, None, None, 1],</span> |
| <span class="sd"> ... 'D': [0, 1, 5, 4]</span> |
| <span class="sd"> ... },</span> |
| <span class="sd"> ... columns=['A', 'B', 'C', 'D'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B C D</span> |
| <span class="sd"> 0 1 2.0 NaN 0</span> |
| <span class="sd"> 1 1 4.0 NaN 1</span> |
| <span class="sd"> 2 2 NaN NaN 5</span> |
| <span class="sd"> 3 2 3.0 1.0 4</span> |
| |
| <span class="sd"> Propagate non-null values forward.</span> |
| |
| <span class="sd"> >>> df.groupby(['A']).ffill().sort_index()</span> |
| <span class="sd"> B C D</span> |
| <span class="sd"> 0 2.0 NaN 0</span> |
| <span class="sd"> 1 4.0 NaN 1</span> |
| <span class="sd"> 2 NaN NaN 5</span> |
| <span class="sd"> 3 3.0 1.0 4</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">fillna</span><span class="p">(</span><span class="n">method</span><span class="o">=</span><span class="s2">"ffill"</span><span class="p">,</span> <span class="n">limit</span><span class="o">=</span><span class="n">limit</span><span class="p">)</span></div> |
| |
| <span class="n">pad</span> <span class="o">=</span> <span class="n">ffill</span> |
| |
| <span class="k">def</span> <span class="nf">_limit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">asc</span><span class="p">:</span> <span class="nb">bool</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Private function for tail and head.</span> |
| <span class="sd"> """</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> |
| |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span> |
| <span class="p">]</span> |
| |
| <span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> |
| <span class="n">agg_columns</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="n">tmp_col</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__row_number__"</span><span class="p">)</span> |
| |
| <span class="c1"># This part is handled differently depending on whether it is a tail or a head.</span> |
| <span class="n">window</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_scols</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span><span class="o">.</span><span class="n">asc</span><span class="p">())</span> |
| <span class="k">if</span> <span class="n">asc</span> |
| <span class="k">else</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_scols</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span><span class="o">.</span><span class="n">desc</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">tmp_col</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">tmp_col</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">n</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">tmp_col</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span><span class="n">sdf</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cleanup_and_return</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">))</span> |
| |
| <div class="viewcode-block" id="GroupBy.head"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.head.html#pyspark.pandas.groupby.GroupBy.head">[docs]</a> <span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return first n rows of each group.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 1, 1, 2, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'b': [2, 3, 1, 4, 6, 9, 8, 10, 7, 5],</span> |
| <span class="sd"> ... 'c': [3, 5, 2, 5, 1, 2, 6, 4, 3, 6]},</span> |
| <span class="sd"> ... columns=['a', 'b', 'c'],</span> |
| <span class="sd"> ... index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 7 1 2 3</span> |
| <span class="sd"> 2 1 3 5</span> |
| <span class="sd"> 4 1 1 2</span> |
| <span class="sd"> 1 1 4 5</span> |
| <span class="sd"> 3 2 6 1</span> |
| <span class="sd"> 4 2 9 2</span> |
| <span class="sd"> 9 2 8 6</span> |
| <span class="sd"> 10 3 10 4</span> |
| <span class="sd"> 5 3 7 3</span> |
| <span class="sd"> 6 3 5 6</span> |
| |
| <span class="sd"> >>> df.groupby('a').head(2).sort_index()</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 2 1 3 5</span> |
| <span class="sd"> 3 2 6 1</span> |
| <span class="sd"> 4 2 9 2</span> |
| <span class="sd"> 5 3 7 3</span> |
| <span class="sd"> 7 1 2 3</span> |
| <span class="sd"> 10 3 10 4</span> |
| |
| <span class="sd"> >>> df.groupby('a')['b'].head(2).sort_index()</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> 3 6</span> |
| <span class="sd"> 4 9</span> |
| <span class="sd"> 5 7</span> |
| <span class="sd"> 7 2</span> |
| <span class="sd"> 10 10</span> |
| <span class="sd"> Name: b, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_limit</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">asc</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.tail"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.tail.html#pyspark.pandas.groupby.GroupBy.tail">[docs]</a> <span class="k">def</span> <span class="nf">tail</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return last n rows of each group.</span> |
| |
| <span class="sd"> Similar to `.apply(lambda x: x.tail(n))`, but it returns a subset of rows from</span> |
| <span class="sd"> the original DataFrame with original index and order preserved (`as_index` flag is ignored).</span> |
| |
| <span class="sd"> Does not work for negative values of n.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame or Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 1, 1, 2, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'b': [2, 3, 1, 4, 6, 9, 8, 10, 7, 5],</span> |
| <span class="sd"> ... 'c': [3, 5, 2, 5, 1, 2, 6, 4, 3, 6]},</span> |
| <span class="sd"> ... columns=['a', 'b', 'c'],</span> |
| <span class="sd"> ... index=[7, 2, 3, 1, 3, 4, 9, 10, 5, 6])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 7 1 2 3</span> |
| <span class="sd"> 2 1 3 5</span> |
| <span class="sd"> 3 1 1 2</span> |
| <span class="sd"> 1 1 4 5</span> |
| <span class="sd"> 3 2 6 1</span> |
| <span class="sd"> 4 2 9 2</span> |
| <span class="sd"> 9 2 8 6</span> |
| <span class="sd"> 10 3 10 4</span> |
| <span class="sd"> 5 3 7 3</span> |
| <span class="sd"> 6 3 5 6</span> |
| |
| <span class="sd"> >>> df.groupby('a').tail(2).sort_index()</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 1 1 4 5</span> |
| <span class="sd"> 3 1 1 2</span> |
| <span class="sd"> 4 2 9 2</span> |
| <span class="sd"> 5 3 7 3</span> |
| <span class="sd"> 6 3 5 6</span> |
| <span class="sd"> 9 2 8 6</span> |
| |
| <span class="sd"> >>> df.groupby('a')['b'].tail(2).sort_index()</span> |
| <span class="sd"> 1 4</span> |
| <span class="sd"> 3 1</span> |
| <span class="sd"> 4 9</span> |
| <span class="sd"> 5 7</span> |
| <span class="sd"> 6 5</span> |
| <span class="sd"> 9 8</span> |
| <span class="sd"> Name: b, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_limit</span><span class="p">(</span><span class="n">n</span><span class="p">,</span> <span class="n">asc</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.shift"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.shift.html#pyspark.pandas.groupby.GroupBy.shift">[docs]</a> <span class="k">def</span> <span class="nf">shift</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Shift each group by periods observations.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> periods : integer, default 1</span> |
| <span class="sd"> number of periods to shift</span> |
| <span class="sd"> fill_value : optional</span> |
| <span class="sd"> The scalar value to use for newly introduced missing values.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| <span class="sd"> Object shifted within each group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({</span> |
| <span class="sd"> ... 'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b</span> |
| <span class="sd"> 0 1 1</span> |
| <span class="sd"> 1 1 2</span> |
| <span class="sd"> 2 1 2</span> |
| <span class="sd"> 3 2 2</span> |
| <span class="sd"> 4 2 3</span> |
| <span class="sd"> 5 2 3</span> |
| <span class="sd"> 6 3 3</span> |
| <span class="sd"> 7 3 4</span> |
| <span class="sd"> 8 3 4</span> |
| |
| <span class="sd"> >>> df.groupby('a').shift().sort_index() # doctest: +SKIP</span> |
| <span class="sd"> b</span> |
| <span class="sd"> 0 NaN</span> |
| <span class="sd"> 1 1.0</span> |
| <span class="sd"> 2 2.0</span> |
| <span class="sd"> 3 NaN</span> |
| <span class="sd"> 4 2.0</span> |
| <span class="sd"> 5 3.0</span> |
| <span class="sd"> 6 NaN</span> |
| <span class="sd"> 7 3.0</span> |
| <span class="sd"> 8 4.0</span> |
| |
| <span class="sd"> >>> df.groupby('a').shift(periods=-1, fill_value=0).sort_index() # doctest: +SKIP</span> |
| <span class="sd"> b</span> |
| <span class="sd"> 0 2</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 0</span> |
| <span class="sd"> 3 3</span> |
| <span class="sd"> 4 3</span> |
| <span class="sd"> 5 0</span> |
| <span class="sd"> 6 4</span> |
| <span class="sd"> 7 4</span> |
| <span class="sd"> 8 0</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_apply_series_op</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">sg</span><span class="p">:</span> <span class="n">sg</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_shift</span><span class="p">(</span><span class="n">periods</span><span class="p">,</span> <span class="n">fill_value</span><span class="p">,</span> <span class="n">part_cols</span><span class="o">=</span><span class="n">sg</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">),</span> |
| <span class="n">should_resolve</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.transform"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.transform.html#pyspark.pandas.groupby.GroupBy.transform">[docs]</a> <span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">],</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Apply function column-by-column to the GroupBy object.</span> |
| |
| <span class="sd"> The function passed to `transform` must take a Series as its first</span> |
| <span class="sd"> argument and return a Series. The given function is executed for</span> |
| <span class="sd"> each series in each grouped data.</span> |
| |
| <span class="sd"> While `transform` is a very flexible method, its downside is that</span> |
| <span class="sd"> using it can be quite a bit slower than using more specific methods</span> |
| <span class="sd"> like `agg`. pandas-on-Spark offers a wide range of methods that will</span> |
| <span class="sd"> be much faster than using `transform` for their specific purposes, so try to</span> |
| <span class="sd"> use them before reaching for `transform`.</span> |
| |
| <span class="sd"> .. note:: this API executes the function once to infer the type which is</span> |
| <span class="sd"> potentially expensive, for instance, when the dataset is created after</span> |
| <span class="sd"> aggregations or sorting.</span> |
| |
| <span class="sd"> To avoid this, specify return type in ``func``, for instance, as below:</span> |
| |
| <span class="sd"> >>> def convert_to_string(x) -> ps.Series[str]:</span> |
| <span class="sd"> ... return x.apply("a string {}".format)</span> |
| |
| <span class="sd"> When the given function has the return type annotated, the original index of the</span> |
| <span class="sd"> GroupBy object will be lost and a default index will be attached to the result.</span> |
| <span class="sd"> Please be careful about configuring the default index. See also `Default Index Type</span> |
| <span class="sd"> &lt;https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type&gt;`_.</span> |
| |
| <span class="sd"> .. note:: the series within ``func`` is actually a pandas series. Therefore,</span> |
| <span class="sd"> any pandas API within this function is allowed.</span> |
| |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : callable</span> |
| <span class="sd"> A callable that takes a Series as its first argument, and</span> |
| <span class="sd"> returns a Series.</span> |
| <span class="sd"> *args</span> |
| <span class="sd"> Positional arguments to pass to func.</span> |
| <span class="sd"> **kwargs</span> |
| <span class="sd"> Keyword arguments to pass to func.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> applied : DataFrame</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> aggregate : Apply aggregate function to the GroupBy object.</span> |
| <span class="sd"> Series.apply : Apply a function to a Series.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'A': [0, 0, 1],</span> |
| <span class="sd"> ... 'B': [1, 2, 3],</span> |
| <span class="sd"> ... 'C': [4, 6, 5]}, columns=['A', 'B', 'C'])</span> |
| |
| <span class="sd"> >>> g = df.groupby('A')</span> |
| |
| <span class="sd"> Notice that ``g`` has two groups, ``0`` and ``1``.</span> |
| <span class="sd"> Calling `transform` in various ways, we can get different grouping results.</span> |
| <span class="sd"> Below, each function passed to `transform` takes a Series as</span> |
| <span class="sd"> its argument and returns a Series. `transform` applies the function on each series</span> |
| <span class="sd"> in each grouped data, and combines them into a new DataFrame:</span> |
| |
| <span class="sd"> >>> def convert_to_string(x) -> ps.Series[str]:</span> |
| <span class="sd"> ... return x.apply("a string {}".format)</span> |
| <span class="sd"> >>> g.transform(convert_to_string) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 a string 1 a string 4</span> |
| <span class="sd"> 1 a string 2 a string 6</span> |
| <span class="sd"> 2 a string 3 a string 5</span> |
| |
| <span class="sd"> >>> def plus_max(x) -> ps.Series[np.int]:</span> |
| <span class="sd"> ... return x + x.max()</span> |
| <span class="sd"> >>> g.transform(plus_max) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 3 10</span> |
| <span class="sd"> 1 4 12</span> |
| <span class="sd"> 2 6 10</span> |
| |
| <span class="sd"> You can omit the type hint and let pandas-on-Spark infer its type.</span> |
| |
| <span class="sd"> >>> def plus_min(x):</span> |
| <span class="sd"> ... return x + x.min()</span> |
| <span class="sd"> >>> g.transform(plus_min) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 2 8</span> |
| <span class="sd"> 1 3 10</span> |
| <span class="sd"> 2 6 10</span> |
| |
| <span class="sd"> In case of Series, it works as below.</span> |
| |
| <span class="sd"> >>> df.B.groupby(df.A).transform(plus_max)</span> |
| <span class="sd"> 0 3</span> |
| <span class="sd"> 1 4</span> |
| <span class="sd"> 2 6</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| |
| <span class="sd"> >>> (df * -1).B.groupby(df.A).transform(abs)</span> |
| <span class="sd"> 0 1</span> |
| <span class="sd"> 1 2</span> |
| <span class="sd"> 2 3</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| |
| <span class="sd"> You can also specify extra arguments to pass to the function.</span> |
| |
| <span class="sd"> >>> def calculation(x, y, z) -> ps.Series[np.int]:</span> |
| <span class="sd"> ... return x + x.min() + y + z</span> |
| <span class="sd"> >>> g.transform(calculation, 5, z=20) # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> B C</span> |
| <span class="sd"> 0 27 33</span> |
| <span class="sd"> 1 28 35</span> |
| <span class="sd"> 2 31 35</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">callable</span><span class="p">(</span><span class="n">func</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"</span><span class="si">%s</span><span class="s2"> object is not callable"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">)</span> |
| |
| <span class="n">spec</span> <span class="o">=</span> <span class="n">inspect</span><span class="o">.</span><span class="n">getfullargspec</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> |
| <span class="n">return_sig</span> <span class="o">=</span> <span class="n">spec</span><span class="o">.</span><span class="n">annotations</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"return"</span><span class="p">,</span> <span class="kc">None</span><span class="p">)</span> |
| |
| <span class="n">psdf</span><span class="p">,</span> <span class="n">groupkey_labels</span><span class="p">,</span> <span class="n">groupkey_names</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_prepare_group_map_apply</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">agg_columns</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">pandas_transform</span><span class="p">(</span><span class="n">pdf</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="n">should_infer_schema</span> <span class="o">=</span> <span class="n">return_sig</span> <span class="ow">is</span> <span class="kc">None</span> |
| |
| <span class="k">if</span> <span class="n">should_infer_schema</span><span class="p">:</span> |
| <span class="c1"># Here we execute with the first `compute.shortcut_limit` + 1 records to get the return type.</span> |
| <span class="c1"># If there are no more than `compute.shortcut_limit` records, it uses pandas API directly for a shortcut.</span> |
| <span class="n">log_advice</span><span class="p">(</span> |
| <span class="s2">"If the type hints is not specified for `grouby.transform`, "</span> |
| <span class="s2">"it is expensive to infer the data type internally."</span> |
| <span class="p">)</span> |
| <span class="n">limit</span> <span class="o">=</span> <span class="n">get_option</span><span class="p">(</span><span class="s2">"compute.shortcut_limit"</span><span class="p">)</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="n">limit</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">_to_internal_pandas</span><span class="p">()</span> |
| <span class="n">pdf</span> <span class="o">=</span> <span class="n">pdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">transform</span><span class="p">(</span><span class="n">func</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="n">psdf_from_pandas</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">force_decimal_precision_scale</span><span class="p">(</span> |
| <span class="n">as_nullable_spark_type</span><span class="p">(</span> |
| <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="o">*</span><span class="n">HIDDEN_COLUMNS</span><span class="p">)</span><span class="o">.</span><span class="n">schema</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">pdf</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">limit</span><span class="p">:</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cleanup_and_return</span><span class="p">(</span><span class="n">psdf_from_pandas</span><span class="p">)</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">pandas_transform</span><span class="p">,</span> |
| <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span> |
| <span class="n">return_schema</span><span class="p">,</span> |
| <span class="n">retain_index</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="c1"># If schema is inferred, we can restore indexes too.</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span> |
| <span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span> |
| <span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">psdf_from_pandas</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span> |
| <span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">return_type</span> <span class="o">=</span> <span class="n">infer_return_type</span><span class="p">(</span><span class="n">func</span><span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">return_type</span><span class="p">,</span> <span class="n">SeriesType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"Expected the return type of this function to be of Series type, "</span> |
| <span class="s2">"but found type </span><span class="si">{}</span><span class="s2">"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">return_type</span><span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="n">dtype</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">dtype</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">return_type</span><span class="o">.</span><span class="n">spark_type</span> |
| |
| <span class="n">data_fields</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">InternalField</span><span class="p">(</span><span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">struct_field</span><span class="o">=</span><span class="n">StructField</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">c</span><span class="p">,</span> <span class="n">dataType</span><span class="o">=</span><span class="n">spark_type</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span> |
| <span class="k">if</span> <span class="n">c</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">groupkey_names</span> |
| <span class="p">]</span> |
| |
| <span class="n">return_schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">([</span><span class="n">field</span><span class="o">.</span><span class="n">struct_field</span> <span class="k">for</span> <span class="n">field</span> <span class="ow">in</span> <span class="n">data_fields</span><span class="p">])</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_spark_group_map_apply</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">pandas_transform</span><span class="p">,</span> |
| <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_column_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">groupkey_labels</span><span class="p">],</span> |
| <span class="n">return_schema</span><span class="p">,</span> |
| <span class="n">retain_index</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="c1"># Otherwise, it loses index.</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> <span class="n">index_spark_columns</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">data_fields</span><span class="o">=</span><span class="n">data_fields</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cleanup_and_return</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.nunique"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.nunique.html#pyspark.pandas.groupby.GroupBy.nunique">[docs]</a> <span class="k">def</span> <span class="nf">nunique</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return DataFrame with number of distinct observations per group for each column.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> dropna : boolean, default True</span> |
| <span class="sd"> Don’t include NaN in the counts.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> nunique : DataFrame or Series</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> df = ps.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',</span> |
| <span class="sd"> ... 'ham', 'ham'],</span> |
| <span class="sd"> ... 'value1': [1, 5, 5, 2, 5, 5],</span> |
| <span class="sd"> ... 'value2': list('abbaxy')}, columns=['id', 'value1', 'value2'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> id value1 value2</span> |
| <span class="sd"> 0 spam 1 a</span> |
| <span class="sd"> 1 egg 5 b</span> |
| <span class="sd"> 2 egg 5 b</span> |
| <span class="sd"> 3 spam 2 a</span> |
| <span class="sd"> 4 ham 5 x</span> |
| <span class="sd"> 5 ham 5 y</span> |
| |
| <span class="sd"> >>> df.groupby('id').nunique().sort_index() # doctest: +SKIP</span> |
| <span class="sd"> value1 value2</span> |
| <span class="sd"> id</span> |
| <span class="sd"> egg 1 1</span> |
| <span class="sd"> ham 1 2</span> |
| <span class="sd"> spam 2 1</span> |
| |
| <span class="sd"> >>> df.groupby('id')['value1'].nunique().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> id</span> |
| <span class="sd"> egg 1</span> |
| <span class="sd"> ham 1</span> |
| <span class="sd"> spam 2</span> |
| <span class="sd"> Name: value1, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">dropna</span><span class="p">:</span> |
| |
| <span class="k">def</span> <span class="nf">stat_function</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">countDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| |
| <span class="k">def</span> <span class="nf">stat_function</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">countDistinct</span><span class="p">(</span><span class="n">col</span><span class="p">)</span> <span class="o">+</span> <span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span> |
| <span class="n">F</span><span class="o">.</span><span class="n">count</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">when</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">isNull</span><span class="p">(),</span> <span class="mi">1</span><span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="kc">None</span><span class="p">))</span> <span class="o">>=</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">otherwise</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">stat_function</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">rolling</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">window</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"RollingGroupby[FrameLike]"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return a rolling grouper, providing rolling</span> |
| <span class="sd"> functionality per group.</span> |
| |
| <span class="sd"> .. note:: 'min_periods' in pandas-on-Spark works as a fixed window size unlike pandas.</span> |
| <span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span> |
| <span class="sd"> in the near future.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> window : int, or offset</span> |
| <span class="sd"> Size of the moving window.</span> |
| <span class="sd"> This is the number of observations used for calculating the statistic.</span> |
| <span class="sd"> Each window will be a fixed size.</span> |
| |
| <span class="sd"> min_periods : int, default None</span> |
| <span class="sd"> Minimum number of observations in window required to have a value</span> |
| <span class="sd"> (otherwise result is NA).</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.groupby</span> |
| <span class="sd"> DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">RollingGroupby</span> |
| |
| <span class="k">return</span> <span class="n">RollingGroupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">window</span><span class="p">,</span> <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">expanding</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">min_periods</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"ExpandingGroupby[FrameLike]"</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return an expanding grouper, providing expanding</span> |
| <span class="sd"> functionality per group.</span> |
| |
| <span class="sd"> .. note:: 'min_periods' in pandas-on-Spark works as a fixed window size unlike pandas.</span> |
| <span class="sd"> Unlike pandas, NA is also counted as the period. This might be changed</span> |
| <span class="sd"> in the near future.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> min_periods : int, default 1</span> |
| <span class="sd"> Minimum number of observations in window required to have a value</span> |
| <span class="sd"> (otherwise result is NA).</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Series.groupby</span> |
| <span class="sd"> DataFrame.groupby</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.window</span> <span class="kn">import</span> <span class="n">ExpandingGroupby</span> |
| |
| <span class="k">return</span> <span class="n">ExpandingGroupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">min_periods</span><span class="o">=</span><span class="n">min_periods</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="GroupBy.get_group"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.get_group.html#pyspark.pandas.groupby.GroupBy.get_group">[docs]</a> <span class="k">def</span> <span class="nf">get_group</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Name</span><span class="p">]])</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Construct DataFrame from group with provided name.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> name : object</span> |
| <span class="sd"> The name of the group to get as a DataFrame.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> group : same type as obj</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psdf = ps.DataFrame([('falcon', 'bird', 389.0),</span> |
| <span class="sd"> ... ('parrot', 'bird', 24.0),</span> |
| <span class="sd"> ... ('lion', 'mammal', 80.5),</span> |
| <span class="sd"> ... ('monkey', 'mammal', np.nan)],</span> |
| <span class="sd"> ... columns=['name', 'class', 'max_speed'],</span> |
| <span class="sd"> ... index=[0, 2, 3, 1])</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> name class max_speed</span> |
| <span class="sd"> 0 falcon bird 389.0</span> |
| <span class="sd"> 2 parrot bird 24.0</span> |
| <span class="sd"> 3 lion mammal 80.5</span> |
| <span class="sd"> 1 monkey mammal NaN</span> |
| |
| <span class="sd"> >>> psdf.groupby("class").get_group("bird").sort_index()</span> |
| <span class="sd"> name class max_speed</span> |
| <span class="sd"> 0 falcon bird 389.0</span> |
| <span class="sd"> 2 parrot bird 24.0</span> |
| |
| <span class="sd"> >>> psdf.groupby("class").get_group("mammal").sort_index()</span> |
| <span class="sd"> name class max_speed</span> |
| <span class="sd"> 1 monkey mammal NaN</span> |
| <span class="sd"> 3 lion mammal 80.5</span> |
| <span class="sd"> """</span> |
| <span class="n">groupkeys</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_hashable</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"unhashable type: '</span><span class="si">{}</span><span class="s2">'"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="nb">type</span><span class="p">(</span><span class="n">name</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">)</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"must supply a tuple to get_group with multiple grouping keys"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">)</span> <span class="o">!=</span> <span class="nb">len</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"must supply a same-length tuple to get_group with multiple grouping keys"</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_list_like</span><span class="p">(</span><span class="n">name</span><span class="p">):</span> |
| <span class="n">name</span> <span class="o">=</span> <span class="p">[</span><span class="n">name</span><span class="p">]</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">SF</span><span class="o">.</span><span class="n">lit</span><span class="p">(</span><span class="kc">True</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">groupkey</span><span class="p">,</span> <span class="n">item</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">name</span><span class="p">):</span> |
| <span class="n">scol</span> <span class="o">=</span> <span class="n">groupkey</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> |
| <span class="n">cond</span> <span class="o">=</span> <span class="n">cond</span> <span class="o">&</span> <span class="p">(</span><span class="n">scol</span> <span class="o">==</span> <span class="n">item</span><span class="p">)</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_selected</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span> |
| <span class="n">spark_frame</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_columns</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns_scols</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">spark_frame</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">spark_frame</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">spark_frame</span><span class="p">,</span> <span class="n">s</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> |
| <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">s</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_filter</span><span class="p">(</span><span class="n">cond</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">head</span><span class="p">()</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cleanup_and_return</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="GroupBy.median"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.GroupBy.median.html#pyspark.pandas.groupby.GroupBy.median">[docs]</a> <span class="k">def</span> <span class="nf">median</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">10000</span><span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute median of groups, excluding missing values.</span> |
| |
| <span class="sd"> For multiple groupings, the result index will be a MultiIndex.</span> |
| |
| <span class="sd"> .. note:: Unlike pandas, the median in pandas-on-Spark is an approximated median based upon</span> |
| <span class="sd"> approximate percentile computation because computing median across a large dataset</span> |
| <span class="sd"> is extremely expensive.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numeric_only : bool, default True</span> |
| <span class="sd"> Include only float, int, boolean columns. False is not supported. This parameter</span> |
| <span class="sd"> is mainly for pandas compatibility.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Series or DataFrame</span> |
| <span class="sd"> Median of values within each group.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> psdf = ps.DataFrame({'a': [1., 1., 1., 1., 2., 2., 2., 3., 3., 3.],</span> |
| <span class="sd"> ... 'b': [2., 3., 1., 4., 6., 9., 8., 10., 7., 5.],</span> |
| <span class="sd"> ... 'c': [3., 5., 2., 5., 1., 2., 6., 4., 3., 6.]},</span> |
| <span class="sd"> ... columns=['a', 'b', 'c'],</span> |
| <span class="sd"> ... index=[7, 2, 4, 1, 3, 4, 9, 10, 5, 6])</span> |
| <span class="sd"> >>> psdf</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 7 1.0 2.0 3.0</span> |
| <span class="sd"> 2 1.0 3.0 5.0</span> |
| <span class="sd"> 4 1.0 1.0 2.0</span> |
| <span class="sd"> 1 1.0 4.0 5.0</span> |
| <span class="sd"> 3 2.0 6.0 1.0</span> |
| <span class="sd"> 4 2.0 9.0 2.0</span> |
| <span class="sd"> 9 2.0 8.0 6.0</span> |
| <span class="sd"> 10 3.0 10.0 4.0</span> |
| <span class="sd"> 5 3.0 7.0 3.0</span> |
| <span class="sd"> 6 3.0 5.0 6.0</span> |
| |
| <span class="sd"> DataFrameGroupBy</span> |
| |
| <span class="sd"> >>> psdf.groupby('a').median().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b c</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1.0 2.0 3.0</span> |
| <span class="sd"> 2.0 8.0 2.0</span> |
| <span class="sd"> 3.0 7.0 4.0</span> |
| |
| <span class="sd"> SeriesGroupBy</span> |
| |
| <span class="sd"> >>> psdf.groupby('a')['b'].median().sort_index()</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1.0 2.0</span> |
| <span class="sd"> 2.0 8.0</span> |
| <span class="sd"> 3.0 7.0</span> |
| <span class="sd"> Name: b, dtype: float64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">accuracy</span><span class="p">,</span> <span class="nb">int</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span> |
| <span class="s2">"accuracy must be an integer; however, got [</span><span class="si">%s</span><span class="s2">]"</span> <span class="o">%</span> <span class="nb">type</span><span class="p">(</span><span class="n">accuracy</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">stat_function</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">F</span><span class="o">.</span><span class="n">percentile_approx</span><span class="p">(</span><span class="n">col</span><span class="p">,</span> <span class="mf">0.5</span><span class="p">,</span> <span class="n">accuracy</span><span class="p">)</span> |
| |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">stat_function</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="n">numeric_only</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_reduce_for_stat_function</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">sfun</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Column</span><span class="p">],</span> <span class="n">Column</span><span class="p">],</span> <span class="n">only_numeric</span><span class="p">:</span> <span class="nb">bool</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">FrameLike</span><span class="p">:</span> |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span> |
| <span class="n">groupkey_scols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span> |
| |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">psser</span> |
| <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)</span> <span class="ow">or</span> <span class="ow">not</span> <span class="n">only_numeric</span> |
| <span class="p">]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="o">*</span><span class="n">groupkey_scols</span><span class="p">,</span> <span class="o">*</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">]</span> |
| <span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span> |
| <span class="n">column_label_names</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_label_names</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">stat_exprs</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="n">stat_exprs</span><span class="o">.</span><span class="n">append</span><span class="p">(</span> |
| <span class="n">sfun</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_dtype_op</span><span class="o">.</span><span class="n">nan_to_null</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">)</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="o">*</span><span class="n">stat_exprs</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">select</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_names</span><span class="p">)</span><span class="o">.</span><span class="n">distinct</span><span class="p">()</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_sdf</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span> |
| <span class="n">subset</span><span class="o">=</span><span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_column_names</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">:</span> |
| <span class="n">should_drop_index</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span> |
| <span class="n">i</span> <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">gkey</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> <span class="k">if</span> <span class="n">gkey</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">should_drop_index</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">level</span><span class="o">=</span><span class="n">should_drop_index</span><span class="p">,</span> <span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">should_drop_index</span><span class="p">)</span> <span class="o">&lt;</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">):</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_cleanup_and_return</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_resolve_grouping_from_diff_dataframes</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]]</span> |
| <span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> <span class="n">Set</span><span class="p">[</span><span class="n">Label</span><span class="p">]]:</span> |
| <span class="n">column_labels_level</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels_level</span> |
| |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">additional_pssers</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">additional_column_labels</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">tmp_column_labels</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">by</span><span class="p">):</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">col_or_s</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="n">psdf</span><span class="p">:</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="o">.</span><span class="n">_column_label</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">psdf</span><span class="p">):</span> |
| <span class="n">temp_label</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="s2">"__tmp_groupkey_</span><span class="si">{}</span><span class="s2">__"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span> |
| <span class="n">additional_pssers</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">temp_label</span><span class="p">))</span> |
| <span class="n">additional_column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">temp_label</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="nb">tuple</span><span class="p">(</span> |
| <span class="p">([</span><span class="s2">""</span><span class="p">]</span> <span class="o">*</span> <span class="p">(</span><span class="n">column_labels_level</span> <span class="o">-</span> <span class="mi">1</span><span class="p">))</span> <span class="o">+</span> <span class="p">[</span><span class="s2">"__tmp_groupkey_</span><span class="si">{}</span><span class="s2">__"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">i</span><span class="p">)]</span> |
| <span class="p">),</span> |
| <span class="p">)</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span> |
| <span class="n">tmp_column_labels</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="n">temp_label</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">col_or_s</span><span class="p">]</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">))</span> |
| <span class="n">column_labels</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span><span class="p">]</span> |
| <span class="o">+</span> <span class="n">additional_pssers</span> |
| <span class="p">)</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">assign_columns</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">this_column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> <span class="n">that_column_labels</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> |
| <span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]]:</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span> |
| <span class="s2">"Duplicated labels with groupby() and "</span> |
| <span class="s2">"'compute.ops_on_diff_frames' option are not supported currently "</span> |
| <span class="s2">"Please use unique labels in series and frames."</span> |
| <span class="p">)</span> |
| |
| <span class="k">for</span> <span class="n">col_or_s</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">tmp_column_labels</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">col_or_s</span> |
| <span class="n">psdf</span> <span class="o">=</span> <span class="n">align_diff_frames</span><span class="p">(</span> |
| <span class="n">assign_columns</span><span class="p">,</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">label</span><span class="p">),</span> |
| <span class="n">fillna</span><span class="o">=</span><span class="kc">False</span><span class="p">,</span> |
| <span class="n">how</span><span class="o">=</span><span class="s2">"inner"</span><span class="p">,</span> |
| <span class="n">preserve_order_column</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">tmp_column_labels</span> <span class="o">|=</span> <span class="nb">set</span><span class="p">(</span><span class="n">additional_column_labels</span><span class="p">)</span> |
| |
| <span class="n">new_by_series</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">col_or_s</span><span class="p">,</span> <span class="n">label</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">by</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">tmp_column_labels</span><span class="p">:</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">col_or_s</span> |
| <span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">name</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">label</span><span class="p">))</span> |
| |
| <span class="k">return</span> <span class="n">psdf</span><span class="p">,</span> <span class="n">new_by_series</span><span class="p">,</span> <span class="n">tmp_column_labels</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_resolve_grouping</span><span class="p">(</span><span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">]:</span> |
| <span class="n">new_by_series</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="n">by</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">psdf</span><span class="p">[</span><span class="n">col_or_s</span><span class="p">]</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">Series</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">))</span> |
| <span class="n">new_by_series</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">psser</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">new_by_series</span> |
| |
| |
| <span class="k">class</span> <span class="nc">DataFrameGroupBy</span><span class="p">(</span><span class="n">GroupBy</span><span class="p">[</span><span class="n">DataFrame</span><span class="p">]):</span> |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_build</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> |
| <span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">"DataFrameGroupBy"</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">any</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">col_or_s</span><span class="p">)</span> <span class="k">for</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="n">by</span><span class="p">):</span> |
| <span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">new_by_series</span><span class="p">,</span> |
| <span class="n">column_labels_to_exclude</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping_from_diff_dataframes</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">by</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">new_by_series</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping</span><span class="p">(</span><span class="n">psdf</span><span class="p">,</span> <span class="n">by</span><span class="p">)</span> |
| <span class="n">column_labels_to_exclude</span> <span class="o">=</span> <span class="nb">set</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">DataFrameGroupBy</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="p">,</span> |
| <span class="n">new_by_series</span><span class="p">,</span> |
| <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span> |
| <span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="n">column_labels_to_exclude</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">,</span> |
| <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> |
| <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="n">column_labels_to_exclude</span><span class="p">:</span> <span class="n">Set</span><span class="p">[</span><span class="n">Label</span><span class="p">],</span> |
| <span class="n">agg_columns</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Label</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">):</span> |
| <span class="n">agg_columns_selected</span> <span class="o">=</span> <span class="n">agg_columns</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="k">if</span> <span class="n">agg_columns_selected</span><span class="p">:</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">column_labels_to_exclude</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">KeyError</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">agg_columns</span> <span class="o">=</span> <span class="p">[</span> |
| <span class="n">label</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">column_labels</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">any</span><span class="p">(</span><span class="n">label</span> <span class="o">==</span> <span class="n">key</span><span class="o">.</span><span class="n">_column_label</span> <span class="ow">and</span> <span class="n">key</span><span class="o">.</span><span class="n">_psdf</span> <span class="ow">is</span> <span class="n">psdf</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="n">by</span><span class="p">)</span> |
| <span class="ow">and</span> <span class="n">label</span> <span class="ow">not</span> <span class="ow">in</span> <span class="n">column_labels_to_exclude</span> |
| <span class="p">]</span> |
| |
| <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">=</span><span class="n">psdf</span><span class="p">,</span> |
| <span class="n">groupkeys</span><span class="o">=</span><span class="n">by</span><span class="p">,</span> |
| <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span> |
| <span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="n">column_labels_to_exclude</span><span class="p">,</span> |
| <span class="n">agg_columns_selected</span><span class="o">=</span><span class="n">agg_columns_selected</span><span class="p">,</span> |
| <span class="n">agg_columns</span><span class="o">=</span><span class="p">[</span><span class="n">psdf</span><span class="p">[</span><span class="n">label</span><span class="p">]</span> <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">agg_columns</span><span class="p">],</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">MissingPandasLikeDataFrameGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">):</span> |
| <span class="n">property_or_func</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">MissingPandasLikeDataFrameGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="nb">property</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">property_or_func</span><span class="o">.</span><span class="n">fget</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">partial</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="fm">__getitem__</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">GroupBy</span><span class="p">:</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span> <span class="ow">and</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">item</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">SeriesGroupBy</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_psser_for</span><span class="p">(</span><span class="n">item</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">item</span><span class="p">,)),</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">item</span><span class="p">):</span> |
| <span class="n">item</span> <span class="o">=</span> <span class="p">[</span><span class="n">item</span><span class="p">]</span> |
| <span class="k">elif</span> <span class="n">is_name_like_value</span><span class="p">(</span><span class="n">item</span><span class="p">):</span> |
| <span class="n">item</span> <span class="o">=</span> <span class="p">[(</span><span class="n">item</span><span class="p">,)]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">item</span> <span class="o">=</span> <span class="p">[</span><span class="n">i</span> <span class="k">if</span> <span class="n">is_name_like_tuple</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">else</span> <span class="p">(</span><span class="n">i</span><span class="p">,)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="n">item</span><span class="p">]</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">:</span> |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="nb">set</span><span class="p">(</span><span class="n">key</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">key</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">item</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span> |
| <span class="s2">"cannot insert </span><span class="si">{}</span><span class="s2">, already exists"</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">(</span><span class="n">name</span><span class="p">))</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrameGroupBy</span><span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> |
| <span class="n">as_index</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_as_index</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">,</span> |
| <span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_column_labels_to_exclude</span><span class="p">,</span> |
| <span class="n">agg_columns</span><span class="o">=</span><span class="n">item</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">"SeriesGroupBy"</span><span class="p">],</span> <span class="n">Series</span><span class="p">],</span> |
| <span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="n">applied</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="k">for</span> <span class="n">column</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">:</span> |
| <span class="n">applied</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">op</span><span class="p">(</span><span class="n">column</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span> |
| <span class="k">if</span> <span class="n">numeric_only</span><span class="p">:</span> |
| <span class="n">applied</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">applied</span> <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">)]</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">applied</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">DataError</span><span class="p">(</span><span class="s2">"No numeric types to aggregate"</span><span class="p">)</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">with_new_columns</span><span class="p">(</span><span class="n">applied</span><span class="p">,</span> <span class="n">keep_order</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">should_resolve</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_cleanup_and_return</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psdf</span> |
| |
| <span class="c1"># TODO: Implement 'percentiles', 'include', and 'exclude' arguments.</span> |
| <span class="c1"># TODO: Add ``DataFrame.select_dtypes`` to See Also when 'include'</span> |
| <span class="c1"># and 'exclude' arguments are implemented.</span> |
| <div class="viewcode-block" id="DataFrameGroupBy.describe"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.DataFrameGroupBy.describe.html#pyspark.pandas.groupby.DataFrameGroupBy.describe">[docs]</a> <span class="k">def</span> <span class="nf">describe</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Generate descriptive statistics that summarize the central tendency,</span> |
| <span class="sd"> dispersion and shape of a dataset's distribution, excluding</span> |
| <span class="sd"> ``NaN`` values.</span> |
| |
| <span class="sd"> Computes, for each group and each aggregated column, the count,</span> |
| <span class="sd"> mean, standard deviation, minimum, 25%/50%/75% percentiles and</span> |
| <span class="sd"> maximum. String-typed columns are not supported yet and raise</span> |
| <span class="sd"> ``NotImplementedError``.</span> |
| |
| <span class="sd"> .. note:: Unlike pandas, the percentiles in pandas-on-Spark are based upon</span> |
| <span class="sd"> approximate percentile computation because computing percentiles</span> |
| <span class="sd"> across a large dataset is extremely expensive.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> DataFrame</span> |
| <span class="sd"> Summary statistics of the DataFrame provided.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.count</span> |
| <span class="sd"> DataFrame.max</span> |
| <span class="sd"> DataFrame.min</span> |
| <span class="sd"> DataFrame.mean</span> |
| <span class="sd"> DataFrame.std</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> a b c</span> |
| <span class="sd"> 0 1 4 7</span> |
| <span class="sd"> 1 1 5 8</span> |
| <span class="sd"> 2 3 6 9</span> |
| |
| <span class="sd"> Describing a ``DataFrame``. By default only numeric fields</span> |
| <span class="sd"> are returned.</span> |
| |
| <span class="sd"> >>> described = df.groupby('a').describe()</span> |
| <span class="sd"> >>> described.sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> b c</span> |
| <span class="sd"> count mean std min 25% 50% 75% max count mean std min 25% 50% 75% max</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 2.0 4.5 0.707107 4.0 4.0 4.0 5.0 5.0 2.0 7.5 0.707107 7.0 7.0 7.0 8.0 8.0</span> |
| <span class="sd"> 3 1.0 6.0 NaN 6.0 6.0 6.0 6.0 6.0 1.0 9.0 NaN 9.0 9.0 9.0 9.0 9.0</span> |
| |
| <span class="sd"> """</span> |
| <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">col</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">StringType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="ne">NotImplementedError</span><span class="p">(</span> |
| <span class="s2">"DataFrameGroupBy.describe() doesn't support for string type for now"</span> |
| <span class="p">)</span> |
| |
| <span class="n">psdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">aggregate</span><span class="p">([</span><span class="s2">"count"</span><span class="p">,</span> <span class="s2">"mean"</span><span class="p">,</span> <span class="s2">"std"</span><span class="p">,</span> <span class="s2">"min"</span><span class="p">,</span> <span class="s2">"quartiles"</span><span class="p">,</span> <span class="s2">"max"</span><span class="p">])</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| <span class="n">agg_column_labels</span> <span class="o">=</span> <span class="p">[</span><span class="n">col</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">]</span> |
| <span class="n">formatted_percentiles</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"25%"</span><span class="p">,</span> <span class="s2">"50%"</span><span class="p">,</span> <span class="s2">"75%"</span><span class="p">]</span> |
| |
| <span class="c1"># Split "quartiles" columns into first, second, and third quartiles.</span> |
| <span class="k">for</span> <span class="n">label</span> <span class="ow">in</span> <span class="n">agg_column_labels</span><span class="p">:</span> |
| <span class="n">quartiles_col</span> <span class="o">=</span> <span class="n">name_like_string</span><span class="p">(</span><span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="s2">"quartiles"</span><span class="p">]))</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">percentile</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">formatted_percentiles</span><span class="p">):</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span> |
| <span class="n">name_like_string</span><span class="p">(</span><span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="n">percentile</span><span class="p">])),</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">quartiles_col</span><span class="p">)[</span><span class="n">i</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">quartiles_col</span><span class="p">)</span> |
| |
| <span class="c1"># Reorder columns: each agg column in turn, followed by its stats in the fixed order below.</span> |
| <span class="n">stats</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"count"</span><span class="p">,</span> <span class="s2">"mean"</span><span class="p">,</span> <span class="s2">"std"</span><span class="p">,</span> <span class="s2">"min"</span><span class="p">]</span> <span class="o">+</span> <span class="n">formatted_percentiles</span> <span class="o">+</span> <span class="p">[</span><span class="s2">"max"</span><span class="p">]</span> |
| <span class="n">column_labels</span> <span class="o">=</span> <span class="p">[</span><span class="nb">tuple</span><span class="p">(</span><span class="nb">list</span><span class="p">(</span><span class="n">label</span><span class="p">)</span> <span class="o">+</span> <span class="p">[</span><span class="n">s</span><span class="p">])</span> <span class="k">for</span> <span class="n">label</span><span class="p">,</span> <span class="n">s</span> <span class="ow">in</span> <span class="n">product</span><span class="p">(</span><span class="n">agg_column_labels</span><span class="p">,</span> <span class="n">stats</span><span class="p">)]</span> |
| <span class="n">data_columns</span> <span class="o">=</span> <span class="nb">map</span><span class="p">(</span><span class="n">name_like_string</span><span class="p">,</span> <span class="n">column_labels</span><span class="p">)</span> |
| |
| <span class="c1"># Reindex the DataFrame to reflect initial grouping and agg columns.</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="n">column_labels</span><span class="p">,</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">data_columns</span><span class="p">],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># Cast columns to ``"float64"`` to match `pandas.DataFrame.groupby`.</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">)</span><span class="o">.</span><span class="n">astype</span><span class="p">(</span><span class="s2">"float64"</span><span class="p">)</span></div> |
| |
| |
| <span class="k">class</span> <span class="nc">SeriesGroupBy</span><span class="p">(</span><span class="n">GroupBy</span><span class="p">[</span><span class="n">Series</span><span class="p">]):</span> |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_build</span><span class="p">(</span> |
| <span class="n">psser</span><span class="p">:</span> <span class="n">Series</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Series</span><span class="p">,</span> <span class="n">Label</span><span class="p">]],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"SeriesGroupBy"</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">any</span><span class="p">(</span> |
| <span class="nb">isinstance</span><span class="p">(</span><span class="n">col_or_s</span><span class="p">,</span> <span class="n">Series</span><span class="p">)</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">same_anchor</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">col_or_s</span><span class="p">)</span> <span class="k">for</span> <span class="n">col_or_s</span> <span class="ow">in</span> <span class="n">by</span> |
| <span class="p">):</span> |
| <span class="n">psdf</span><span class="p">,</span> <span class="n">new_by_series</span><span class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping_from_diff_dataframes</span><span class="p">(</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">to_frame</span><span class="p">(),</span> <span class="n">by</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">SeriesGroupBy</span><span class="p">(</span> |
| <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">name</span><span class="p">),</span> |
| <span class="n">new_by_series</span><span class="p">,</span> |
| <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">new_by_series</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">_resolve_grouping</span><span class="p">(</span><span class="n">psser</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> <span class="n">by</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">SeriesGroupBy</span><span class="p">(</span><span class="n">psser</span><span class="p">,</span> <span class="n">new_by_series</span><span class="p">,</span> <span class="n">as_index</span><span class="o">=</span><span class="n">as_index</span><span class="p">,</span> <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">psser</span><span class="p">:</span> <span class="n">Series</span><span class="p">,</span> <span class="n">by</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Series</span><span class="p">],</span> <span class="n">as_index</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">):</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">as_index</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">TypeError</span><span class="p">(</span><span class="s2">"as_index=False only valid with DataFrame"</span><span class="p">)</span> |
| <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__init__</span><span class="p">(</span> |
| <span class="n">psdf</span><span class="o">=</span><span class="n">psser</span><span class="o">.</span><span class="n">_psdf</span><span class="p">,</span> |
| <span class="n">groupkeys</span><span class="o">=</span><span class="n">by</span><span class="p">,</span> |
| <span class="n">as_index</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">dropna</span><span class="o">=</span><span class="n">dropna</span><span class="p">,</span> |
| <span class="n">column_labels_to_exclude</span><span class="o">=</span><span class="nb">set</span><span class="p">(),</span> |
| <span class="n">agg_columns_selected</span><span class="o">=</span><span class="kc">True</span><span class="p">,</span> |
| <span class="n">agg_columns</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="p">],</span> |
| <span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span> <span class="o">=</span> <span class="n">psser</span> |
| |
| <span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="n">MissingPandasLikeSeriesGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">):</span> |
| <span class="n">property_or_func</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="n">MissingPandasLikeSeriesGroupBy</span><span class="p">,</span> <span class="n">item</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="nb">property</span><span class="p">):</span> |
| <span class="k">return</span> <span class="n">property_or_func</span><span class="o">.</span><span class="n">fget</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">partial</span><span class="p">(</span><span class="n">property_or_func</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span> |
| <span class="k">raise</span> <span class="ne">AttributeError</span><span class="p">(</span><span class="n">item</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">_apply_series_op</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">op</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="s2">"SeriesGroupBy"</span><span class="p">],</span> <span class="n">Series</span><span class="p">],</span> |
| <span class="n">should_resolve</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">numeric_only</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">numeric_only</span> <span class="ow">and</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">data_type</span><span class="p">,</span> <span class="n">NumericType</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="n">DataError</span><span class="p">(</span><span class="s2">"No numeric types to aggregate"</span><span class="p">)</span> |
| <span class="n">psser</span> <span class="o">=</span> <span class="n">op</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">should_resolve</span><span class="p">:</span> |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">resolved_copy</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">psser</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| |
| <span class="k">def</span> <span class="nf">_cleanup_and_return</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">psdf</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">psdf</span><span class="p">)</span><span class="o">.</span><span class="n">rename</span><span class="p">()</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">agg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">MissingPandasLikeSeriesGroupBy</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">aggregate</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">MissingPandasLikeSeriesGroupBy</span><span class="o">.</span><span class="n">aggregate</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">args</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">size</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="k">return</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="n">size</span><span class="p">()</span><span class="o">.</span><span class="n">rename</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">name</span><span class="p">)</span> |
| |
| <span class="n">size</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">GroupBy</span><span class="o">.</span><span class="n">size</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <span class="c1"># TODO: add keep parameter</span> |
| <div class="viewcode-block" id="SeriesGroupBy.nsmallest"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.nsmallest.html#pyspark.pandas.groupby.SeriesGroupBy.nsmallest">[docs]</a> <span class="k">def</span> <span class="nf">nsmallest</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the smallest `n` elements.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int</span> |
| <span class="sd"> Number of items to retrieve.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.nsmallest</span> |
| <span class="sd"> pyspark.pandas.DataFrame.nsmallest</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.groupby(['a'])['b'].nsmallest(1).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 0 1</span> |
| <span class="sd"> 2 3 2</span> |
| <span class="sd"> 3 6 3</span> |
| <span class="sd"> Name: b, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"nsmallest do not support multi-index now"</span><span class="p">)</span> |
| |
| <span class="n">groupkey_col_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)],</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> |
| <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_col_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">.</span><span class="n">asc</span><span class="p">(),</span> |
| <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">temp_rank_column</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__rank__"</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">n</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_col_names</span><span class="p">]</span> |
| <span class="o">+</span> <span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="p">),</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span> |
| <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span> |
| <span class="p">),</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">(</span> |
| <span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="o">+</span> <span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">field</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="p">),</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_column_label</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <span class="c1"># TODO: add keep parameter</span> |
| <div class="viewcode-block" id="SeriesGroupBy.nlargest"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.nlargest.html#pyspark.pandas.groupby.SeriesGroupBy.nlargest">[docs]</a> <span class="k">def</span> <span class="nf">nlargest</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">5</span><span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return the first n rows ordered by columns in descending order in group.</span> |
| |
| <span class="sd"> Return the first n rows with the largest values in columns, in descending order.</span> |
| <span class="sd"> The columns that are not specified are returned as well, but not used for ordering.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int</span> |
| <span class="sd"> Number of items to retrieve.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.nlargest</span> |
| <span class="sd"> pyspark.pandas.DataFrame.nlargest</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.groupby(['a'])['b'].nlargest(1).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 1 2</span> |
| <span class="sd"> 2 4 3</span> |
| <span class="sd"> 3 7 4</span> |
| <span class="sd"> Name: b, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span> <span class="o">></span> <span class="mi">1</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="ne">ValueError</span><span class="p">(</span><span class="s2">"nlargest do not support multi-index now"</span><span class="p">)</span> |
| |
| <span class="n">groupkey_col_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">))]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span><span class="o">.</span><span class="n">select</span><span class="p">(</span> |
| <span class="o">*</span><span class="p">[</span><span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">scol</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys_scols</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)],</span> |
| <span class="o">*</span><span class="p">[</span> |
| <span class="n">scol</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">scol</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_spark_columns</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="p">,</span> |
| <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">window</span> <span class="o">=</span> <span class="n">Window</span><span class="o">.</span><span class="n">partitionBy</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_col_names</span><span class="p">)</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span><span class="o">.</span><span class="n">desc</span><span class="p">(),</span> |
| <span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="n">temp_rank_column</span> <span class="o">=</span> <span class="n">verify_temp_column_name</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="s2">"__rank__"</span><span class="p">)</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">sdf</span><span class="o">.</span><span class="n">withColumn</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">,</span> <span class="n">F</span><span class="o">.</span><span class="n">row_number</span><span class="p">()</span><span class="o">.</span><span class="n">over</span><span class="p">(</span><span class="n">window</span><span class="p">))</span> |
| <span class="o">.</span><span class="n">filter</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">col</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span> <span class="o">&lt;=</span> <span class="n">n</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">temp_rank_column</span><span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">drop</span><span class="p">(</span><span class="n">NATURAL_ORDER_COLUMN_NAME</span><span class="p">)</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_col_names</span><span class="p">]</span> |
| <span class="o">+</span> <span class="p">[</span> |
| <span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span> |
| <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_level</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="p">),</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">]</span> |
| <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_names</span> |
| <span class="p">),</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">(</span> |
| <span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">,</span> <span class="n">groupkey_col_names</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="o">+</span> <span class="p">[</span> |
| <span class="n">field</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span> <span class="o">+</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)))</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">field</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">index_fields</span><span class="p">)</span> |
| <span class="p">]</span> |
| <span class="p">),</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_column_label</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">])],</span> |
| <span class="n">data_fields</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <span class="c1"># TODO: add bins, normalize parameter</span> |
| <div class="viewcode-block" id="SeriesGroupBy.value_counts"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.value_counts.html#pyspark.pandas.groupby.SeriesGroupBy.value_counts">[docs]</a> <span class="k">def</span> <span class="nf">value_counts</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">sort</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">ascending</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">dropna</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Compute group sizes.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> sort : boolean, default None</span> |
| <span class="sd"> Sort by frequencies.</span> |
| <span class="sd"> ascending : boolean, default False</span> |
| <span class="sd"> Sort in ascending order.</span> |
| <span class="sd"> dropna : boolean, default True</span> |
| <span class="sd"> Don't include counts of NaN.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.groupby</span> |
| <span class="sd"> pyspark.pandas.DataFrame.groupby</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'A': [1, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'B': [1, 1, 2, 3, 3, np.nan]},</span> |
| <span class="sd"> ... columns=['A', 'B'])</span> |
| <span class="sd"> >>> df</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 0 1 1.0</span> |
| <span class="sd"> 1 2 1.0</span> |
| <span class="sd"> 2 2 2.0</span> |
| <span class="sd"> 3 3 3.0</span> |
| <span class="sd"> 4 3 3.0</span> |
| <span class="sd"> 5 3 NaN</span> |
| |
| <span class="sd"> >>> df.groupby('A')['B'].value_counts().sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 1 1.0 1</span> |
| <span class="sd"> 2 1.0 1</span> |
| <span class="sd"> 2.0 1</span> |
| <span class="sd"> 3 3.0 2</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| |
| <span class="sd"> Include counts of NaN when dropna is False.</span> |
| |
| <span class="sd"> >>> df.groupby('A')['B'].value_counts(</span> |
| <span class="sd"> ... dropna=False).sort_index() # doctest: +NORMALIZE_WHITESPACE</span> |
| <span class="sd"> A B</span> |
| <span class="sd"> 1 1.0 1</span> |
| <span class="sd"> 2 1.0 1</span> |
| <span class="sd"> 2.0 1</span> |
| <span class="sd"> 3 3.0 2</span> |
| <span class="sd"> NaN 1</span> |
| <span class="sd"> Name: B, dtype: int64</span> |
| <span class="sd"> """</span> |
| <span class="n">groupkeys</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span> <span class="o">+</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span> |
| <span class="n">groupkey_names</span> <span class="o">=</span> <span class="p">[</span><span class="n">SPARK_INDEX_NAME_FORMAT</span><span class="p">(</span><span class="n">i</span><span class="p">)</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="nb">len</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">))]</span> |
| <span class="n">groupkey_cols</span> <span class="o">=</span> <span class="p">[</span><span class="n">s</span><span class="o">.</span><span class="n">spark</span><span class="o">.</span><span class="n">column</span><span class="o">.</span><span class="n">alias</span><span class="p">(</span><span class="n">name</span><span class="p">)</span> <span class="k">for</span> <span class="n">s</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)]</span> |
| |
| <span class="n">sdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_psdf</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">spark_frame</span> |
| |
| <span class="n">agg_column</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_spark_column_names</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="o">*</span><span class="n">groupkey_cols</span><span class="p">)</span><span class="o">.</span><span class="n">count</span><span class="p">()</span><span class="o">.</span><span class="n">withColumnRenamed</span><span class="p">(</span><span class="s2">"count"</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_dropna</span><span class="p">:</span> |
| <span class="n">_groupkey_column_names</span> <span class="o">=</span> <span class="n">groupkey_names</span><span class="p">[:</span> <span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span><span class="n">subset</span><span class="o">=</span><span class="n">_groupkey_column_names</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">dropna</span><span class="p">:</span> |
| <span class="n">_agg_columns_names</span> <span class="o">=</span> <span class="n">groupkey_names</span><span class="p">[</span><span class="nb">len</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_groupkeys</span><span class="p">)</span> <span class="p">:]</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">dropna</span><span class="p">(</span><span class="n">subset</span><span class="o">=</span><span class="n">_agg_columns_names</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">sort</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">ascending</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)</span><span class="o">.</span><span class="n">asc</span><span class="p">())</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">sdf</span> <span class="o">=</span> <span class="n">sdf</span><span class="o">.</span><span class="n">orderBy</span><span class="p">(</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)</span><span class="o">.</span><span class="n">desc</span><span class="p">())</span> |
| |
| <span class="n">internal</span> <span class="o">=</span> <span class="n">InternalFrame</span><span class="p">(</span> |
| <span class="n">spark_frame</span><span class="o">=</span><span class="n">sdf</span><span class="p">,</span> |
| <span class="n">index_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">col</span><span class="p">)</span> <span class="k">for</span> <span class="n">col</span> <span class="ow">in</span> <span class="n">groupkey_names</span><span class="p">],</span> |
| <span class="n">index_names</span><span class="o">=</span><span class="p">[</span><span class="n">psser</span><span class="o">.</span><span class="n">_column_label</span> <span class="k">for</span> <span class="n">psser</span> <span class="ow">in</span> <span class="n">groupkeys</span><span class="p">],</span> |
| <span class="n">index_fields</span><span class="o">=</span><span class="p">[</span> |
| <span class="n">psser</span><span class="o">.</span><span class="n">_internal</span><span class="o">.</span><span class="n">data_fields</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">copy</span><span class="p">(</span><span class="n">name</span><span class="o">=</span><span class="n">name</span><span class="p">)</span> |
| <span class="k">for</span> <span class="n">psser</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">zip</span><span class="p">(</span><span class="n">groupkeys</span><span class="p">,</span> <span class="n">groupkey_names</span><span class="p">)</span> |
| <span class="p">],</span> |
| <span class="n">column_labels</span><span class="o">=</span><span class="p">[</span><span class="bp">self</span><span class="o">.</span><span class="n">_agg_columns</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_column_label</span><span class="p">],</span> |
| <span class="n">data_spark_columns</span><span class="o">=</span><span class="p">[</span><span class="n">scol_for</span><span class="p">(</span><span class="n">sdf</span><span class="p">,</span> <span class="n">agg_column</span><span class="p">)],</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">first_series</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">internal</span><span class="p">))</span></div> |
| |
| <div class="viewcode-block" id="SeriesGroupBy.unique"><a class="viewcode-back" href="../../../reference/pyspark.pandas/api/pyspark.pandas.groupby.SeriesGroupBy.unique.html#pyspark.pandas.groupby.SeriesGroupBy.unique">[docs]</a> <span class="k">def</span> <span class="nf">unique</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Series</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Return unique values in group.</span> |
| |
| <span class="sd"> Uniques are returned in no guaranteed order. The result is NOT sorted.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.Series.unique</span> |
| <span class="sd"> pyspark.pandas.Index.unique</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = ps.DataFrame({'a': [1, 1, 1, 2, 2, 2, 3, 3, 3],</span> |
| <span class="sd"> ... 'b': [1, 2, 2, 2, 3, 3, 3, 4, 4]}, columns=['a', 'b'])</span> |
| |
| <span class="sd"> >>> df.groupby(['a'])['b'].unique().sort_index() # doctest: +SKIP</span> |
| <span class="sd"> a</span> |
| <span class="sd"> 1 [1, 2]</span> |
| <span class="sd"> 2 [2, 3]</span> |
| <span class="sd"> 3 [3, 4]</span> |
| <span class="sd"> Name: b, dtype: object</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_reduce_for_stat_function</span><span class="p">(</span><span class="n">F</span><span class="o">.</span><span class="n">collect_set</span><span class="p">,</span> <span class="n">only_numeric</span><span class="o">=</span><span class="kc">False</span><span class="p">)</span></div> |
| |
| |
| <span class="k">def</span> <span class="nf">is_multi_agg_with_relabel</span><span class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Check whether the kwargs passed to .agg look like multi-agg with relabeling.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> **kwargs : dict</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> bool</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> is_multi_agg_with_relabel(a='max')</span> |
| <span class="sd"> False</span> |
| <span class="sd"> >>> is_multi_agg_with_relabel(a_max=('a', 'max'),</span> |
| <span class="sd"> ... a_min=('a', 'min'))</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> is_multi_agg_with_relabel()</span> |
| <span class="sd"> False</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">kwargs</span><span class="p">:</span> |
| <span class="k">return</span> <span class="kc">False</span> |
| <span class="k">return</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">v</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="o">==</span> <span class="mi">2</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">kwargs</span><span class="o">.</span><span class="n">values</span><span class="p">())</span> |
| |
| |
| <span class="k">def</span> <span class="nf">normalize_keyword_aggregation</span><span class="p">(</span> |
| <span class="n">kwargs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="nb">str</span><span class="p">]],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="n">Name</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">]]:</span> |
| <span class="sd">"""</span> |
| <span class="sd"> Normalize user-provided kwargs.</span> |
| |
| <span class="sd"> Transforms from the new ``Dict[str, NamedAgg]`` style kwargs</span> |
| <span class="sd"> to the old defaultdict[str, List[scalar]].</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> kwargs : dict</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> aggspec : dict</span> |
| <span class="sd"> The transformed kwargs.</span> |
| <span class="sd"> columns : List[str]</span> |
| <span class="sd"> The user-provided keys.</span> |
| <span class="sd"> order : List[Tuple[str, str]]</span> |
| <span class="sd"> Pairs of the input column name and the aggregation function name.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> normalize_keyword_aggregation({'output': ('input', 'sum')})</span> |
| <span class="sd"> (defaultdict(<class 'list'>, {'input': ['sum']}), ['output'], [('input', 'sum')])</span> |
| <span class="sd"> """</span> |
| <span class="n">aggspec</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="n">defaultdict</span><span class="p">(</span><span class="nb">list</span><span class="p">)</span> |
| <span class="n">order</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">]</span> <span class="o">=</span> <span class="p">[]</span> |
| <span class="n">columns</span><span class="p">,</span> <span class="n">pairs</span> <span class="o">=</span> <span class="nb">zip</span><span class="p">(</span><span class="o">*</span><span class="n">kwargs</span><span class="o">.</span><span class="n">items</span><span class="p">())</span> |
| |
| <span class="k">for</span> <span class="n">column</span><span class="p">,</span> <span class="n">aggfunc</span> <span class="ow">in</span> <span class="n">pairs</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">column</span> <span class="ow">in</span> <span class="n">aggspec</span><span class="p">:</span> |
| <span class="n">aggspec</span><span class="p">[</span><span class="n">column</span><span class="p">]</span><span class="o">.</span><span class="n">append</span><span class="p">(</span><span class="n">aggfunc</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">aggspec</span><span class="p">[</span><span class="n">column</span><span class="p">]</span> <span class="o">=</span> <span class="p">[</span><span class="n">aggfunc</span><span class="p">]</span> |
| |
| <span class="n">order</span><span class="o">.</span><span class="n">append</span><span class="p">((</span><span class="n">column</span><span class="p">,</span> <span class="n">aggfunc</span><span class="p">))</span> |
| <span class="c1"># For MultiIndex, we need to flatten the tuple, e.g. (('y', 'A'), 'max') needs to be</span> |
| <span class="c1"># flattened to ('y', 'A', 'max'); this is a no-op for a normal Index.</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">order</span><span class="p">[</span><span class="mi">0</span><span class="p">][</span><span class="mi">0</span><span class="p">],</span> <span class="nb">tuple</span><span class="p">):</span> |
| <span class="n">order</span> <span class="o">=</span> <span class="p">[(</span><span class="o">*</span><span class="n">levs</span><span class="p">,</span> <span class="n">method</span><span class="p">)</span> <span class="k">for</span> <span class="n">levs</span><span class="p">,</span> <span class="n">method</span> <span class="ow">in</span> <span class="n">order</span><span class="p">]</span> |
| <span class="k">return</span> <span class="n">aggspec</span><span class="p">,</span> <span class="nb">list</span><span class="p">(</span><span class="n">columns</span><span class="p">),</span> <span class="n">order</span> |
| |
| |
| <span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">os</span> |
| <span class="kn">import</span> <span class="nn">doctest</span> |
| <span class="kn">import</span> <span class="nn">sys</span> |
| <span class="kn">import</span> <span class="nn">numpy</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="kn">import</span> <span class="nn">pyspark.pandas.groupby</span> |
| |
| <span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">"SPARK_HOME"</span><span class="p">])</span> |
| |
| <span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">groupby</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"np"</span><span class="p">]</span> <span class="o">=</span> <span class="n">numpy</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"ps"</span><span class="p">]</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">"local[4]"</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">"pyspark.pandas.groupby tests"</span><span class="p">)</span> |
| <span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span> |
| <span class="n">pyspark</span><span class="o">.</span><span class="n">pandas</span><span class="o">.</span><span class="n">groupby</span><span class="p">,</span> |
| <span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> |
| <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span> |
| <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="n">_test</span><span class="p">()</span> |
| </pre></div> |
| |
| </div> |
| |
| |
| <div class='prev-next-bottom'> |
| |
| |
| </div> |
| |
| </main> |
| |
| |
| </div> |
| </div> |
| |
| |
| <script src="../../../_static/js/index.3da636dd464baa7582d2.js"></script> |
| |
| |
| <footer class="footer mt-5 mt-md-0"> |
| <div class="container"> |
| <p> |
| © Copyright .<br/> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 3.0.4.<br/> |
| </p> |
| </div> |
| </footer> |
| </body> |
| </html> |