Source code for pyspark.sql.group (PySpark 3.5.5 documentation)

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
<span class="kn">import</span><span class="w"> </span><span class="nn">sys</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">typing</span><span class="w"> </span><span class="kn">import</span> <span class="n">Callable</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Optional</span><span class="p">,</span> <span class="n">TYPE_CHECKING</span><span class="p">,</span> <span class="n">overload</span><span class="p">,</span> <span class="n">Dict</span><span class="p">,</span> <span class="n">Union</span><span class="p">,</span> <span class="n">cast</span><span class="p">,</span> <span class="n">Tuple</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">py4j.java_gateway</span><span class="w"> </span><span class="kn">import</span> <span class="n">JavaObject</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.column</span><span class="w"> </span><span class="kn">import</span> <span class="n">Column</span><span class="p">,</span> <span class="n">_to_seq</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.session</span><span class="w"> </span><span class="kn">import</span> <span class="n">SparkSession</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.dataframe</span><span class="w"> </span><span class="kn">import</span> <span class="n">DataFrame</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql.pandas.group_ops</span><span class="w"> </span><span class="kn">import</span> <span class="n">PandasGroupedOpsMixin</span>
<span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql._typing</span><span class="w"> </span><span class="kn">import</span> <span class="n">LiteralType</span>
<span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;GroupedData&quot;</span><span class="p">]</span>
<span class="k">def</span><span class="w"> </span><span class="nf">dfapi</span><span class="p">(</span><span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">]:</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_api</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="vm">__name__</span>
<span class="n">jdf</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jgd</span><span class="p">,</span> <span class="n">name</span><span class="p">)()</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">session</span><span class="p">)</span>
<span class="n">_api</span><span class="o">.</span><span class="vm">__name__</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="vm">__name__</span>
<span class="n">_api</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="vm">__doc__</span>
<span class="k">return</span> <span class="n">_api</span>
<span class="k">def</span><span class="w"> </span><span class="nf">df_varargs_api</span><span class="p">(</span><span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">]:</span>
<span class="k">def</span><span class="w"> </span><span class="nf">_api</span><span class="p">(</span><span class="bp">self</span><span class="p">:</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="n">name</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="vm">__name__</span>
<span class="n">jdf</span> <span class="o">=</span> <span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jgd</span><span class="p">,</span> <span class="n">name</span><span class="p">)(</span><span class="n">_to_seq</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="n">_sc</span><span class="p">,</span> <span class="n">cols</span><span class="p">))</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">session</span><span class="p">)</span>
<span class="n">_api</span><span class="o">.</span><span class="vm">__name__</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="vm">__name__</span>
<span class="n">_api</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">f</span><span class="o">.</span><span class="vm">__doc__</span>
<span class="k">return</span> <span class="n">_api</span>
<div class="viewcode-block" id="GroupedData"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.GroupedData.html#pyspark.sql.GroupedData">[docs]</a><span class="k">class</span><span class="w"> </span><span class="nc">GroupedData</span><span class="p">(</span><span class="n">PandasGroupedOpsMixin</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> A set of methods for aggregations on a :class:`DataFrame`,</span>
<span class="sd"> created by :func:`DataFrame.groupBy`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">jgd</span><span class="p">:</span> <span class="n">JavaObject</span><span class="p">,</span> <span class="n">df</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_jgd</span> <span class="o">=</span> <span class="n">jgd</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_df</span> <span class="o">=</span> <span class="n">df</span>
<span class="bp">self</span><span class="o">.</span><span class="n">session</span><span class="p">:</span> <span class="n">SparkSession</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">sparkSession</span>
<span class="k">def</span><span class="w"> </span><span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="n">index</span> <span class="o">=</span> <span class="mi">26</span> <span class="c1"># index to truncate string from the JVM side</span>
<span class="n">jvm_string</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jgd</span><span class="o">.</span><span class="n">toString</span><span class="p">()</span>
<span class="k">if</span> <span class="n">jvm_string</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">jvm_string</span><span class="p">)</span> <span class="o">&gt;</span> <span class="n">index</span> <span class="ow">and</span> <span class="n">jvm_string</span><span class="p">[</span><span class="n">index</span><span class="p">]</span> <span class="o">==</span> <span class="s2">&quot;[&quot;</span><span class="p">:</span>
<span class="k">return</span> <span class="sa">f</span><span class="s2">&quot;GroupedData</span><span class="si">{</span><span class="n">jvm_string</span><span class="p">[</span><span class="n">index</span><span class="p">:]</span><span class="si">}</span><span class="s2">&quot;</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">super</span><span class="p">()</span><span class="o">.</span><span class="fm">__repr__</span><span class="p">()</span>
<span class="nd">@overload</span>
<span class="k">def</span><span class="w"> </span><span class="nf">agg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">exprs</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span><span class="w"> </span><span class="nf">agg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__exprs</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="GroupedData.agg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.GroupedData.agg.html#pyspark.sql.GroupedData.agg">[docs]</a> <span class="k">def</span><span class="w"> </span><span class="nf">agg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">exprs</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Compute aggregates and returns the result as a :class:`DataFrame`.</span>
<span class="sd"> The available aggregate functions can be:</span>
<span class="sd"> 1. built-in aggregation functions, such as `avg`, `max`, `min`, `sum`, `count`</span>
<span class="sd"> 2. group aggregate pandas UDFs, created with :func:`pyspark.sql.functions.pandas_udf`</span>
<span class="sd"> .. note:: There is no partial aggregation with group aggregate UDFs, i.e.,</span>
<span class="sd"> a full shuffle is required. Also, all the data of a group will be loaded into</span>
<span class="sd"> memory, so the user should be aware of the potential OOM risk if data is skewed</span>
<span class="sd"> and certain groups are too large to fit in memory.</span>
<span class="sd"> .. seealso:: :func:`pyspark.sql.functions.pandas_udf`</span>
<span class="sd"> If ``exprs`` is a single :class:`dict` mapping from string to string, then the key</span>
<span class="sd"> is the column to perform aggregation on, and the value is the aggregate function.</span>
<span class="sd"> Alternatively, ``exprs`` can also be a list of aggregate :class:`Column` expressions.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> exprs : dict</span>
<span class="sd"> a dict mapping from column name (string) to aggregate functions (string),</span>
<span class="sd"> or a list of :class:`Column`.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Built-in aggregation functions and group aggregate pandas UDFs cannot be mixed</span>
<span class="sd"> in a single call to this function.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import pandas_udf, PandasUDFType</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(2, &quot;Alice&quot;), (3, &quot;Alice&quot;), (5, &quot;Bob&quot;), (10, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 3|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> | 10| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Group-by name, and count each group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(df.name)</span>
<span class="sd"> GroupedData[grouping...: [name...], value: [age: bigint, name: string], type: GroupBy]</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(df.name).agg({&quot;*&quot;: &quot;count&quot;}).sort(&quot;name&quot;).show()</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> | name|count(1)|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> | Bob| 2|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> Group-by name, and calculate the minimum age.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(df.name).agg(sf.min(df.age)).sort(&quot;name&quot;).show()</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> | name|min(age)|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> | Bob| 5|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> Same as above but uses pandas UDF.</span>
<span class="sd"> &gt;&gt;&gt; @pandas_udf(&#39;int&#39;, PandasUDFType.GROUPED_AGG) # doctest: +SKIP</span>
<span class="sd"> ... def min_udf(v):</span>
<span class="sd"> ... return v.min()</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(df.name).agg(min_udf(df.age)).sort(&quot;name&quot;).show() # doctest: +SKIP</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> | name|min_udf(age)|</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> | Bob| 5|</span>
<span class="sd"> +-----+------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">assert</span> <span class="n">exprs</span><span class="p">,</span> <span class="s2">&quot;exprs should not be empty&quot;</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">exprs</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">exprs</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="nb">dict</span><span class="p">):</span>
<span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jgd</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="n">exprs</span><span class="p">[</span><span class="mi">0</span><span class="p">])</span>
<span class="k">else</span><span class="p">:</span>
<span class="c1"># Columns</span>
<span class="k">assert</span> <span class="nb">all</span><span class="p">(</span><span class="nb">isinstance</span><span class="p">(</span><span class="n">c</span><span class="p">,</span> <span class="n">Column</span><span class="p">)</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">exprs</span><span class="p">),</span> <span class="s2">&quot;all exprs should be Column&quot;</span>
<span class="n">exprs</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Tuple</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="o">...</span><span class="p">],</span> <span class="n">exprs</span><span class="p">)</span>
<span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jgd</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="n">exprs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">.</span><span class="n">_jc</span><span class="p">,</span> <span class="n">_to_seq</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="n">_sc</span><span class="p">,</span> <span class="p">[</span><span class="n">c</span><span class="o">.</span><span class="n">_jc</span> <span class="k">for</span> <span class="n">c</span> <span class="ow">in</span> <span class="n">exprs</span><span class="p">[</span><span class="mi">1</span><span class="p">:]]))</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">session</span><span class="p">)</span></div>
<div class="viewcode-block" id="GroupedData.count"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.GroupedData.count.html#pyspark.sql.GroupedData.count">[docs]</a> <span class="nd">@dfapi</span>
<span class="k">def</span><span class="w"> </span><span class="nf">count</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Counts the number of records for each group.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(2, &quot;Alice&quot;), (3, &quot;Alice&quot;), (5, &quot;Bob&quot;), (10, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 3|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> | 10| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Group-by name, and count each group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(df.name).count().sort(&quot;name&quot;).show()</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> | name|count|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> | Bob| 2|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span></div>
<div class="viewcode-block" id="GroupedData.mean"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.GroupedData.mean.html#pyspark.sql.GroupedData.mean">[docs]</a> <span class="nd">@df_varargs_api</span>
<span class="k">def</span><span class="w"> </span><span class="nf">mean</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Computes average values for each numeric columns for each group.</span>
<span class="sd"> :func:`mean` is an alias for :func:`avg`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : str</span>
<span class="sd"> column names. Non-numeric columns are ignored.</span>
<span class="sd"> &quot;&quot;&quot;</span></div>
<div class="viewcode-block" id="GroupedData.avg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.GroupedData.avg.html#pyspark.sql.GroupedData.avg">[docs]</a> <span class="nd">@df_varargs_api</span>
<span class="k">def</span><span class="w"> </span><span class="nf">avg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Computes average values for each numeric columns for each group.</span>
<span class="sd"> :func:`mean` is an alias for :func:`avg`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : str</span>
<span class="sd"> column names. Non-numeric columns are ignored.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (2, &quot;Alice&quot;, 80), (3, &quot;Alice&quot;, 100),</span>
<span class="sd"> ... (5, &quot;Bob&quot;, 120), (10, &quot;Bob&quot;, 140)], [&quot;age&quot;, &quot;name&quot;, &quot;height&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> |age| name|height|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> | 2|Alice| 80|</span>
<span class="sd"> | 3|Alice| 100|</span>
<span class="sd"> | 5| Bob| 120|</span>
<span class="sd"> | 10| Bob| 140|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> Group-by name, and calculate the mean of the age in each group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(&quot;name&quot;).avg(&#39;age&#39;).sort(&quot;name&quot;).show()</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> | name|avg(age)|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> |Alice| 2.5|</span>
<span class="sd"> | Bob| 7.5|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> Calculate the mean of the age and height in all data.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy().avg(&#39;age&#39;, &#39;height&#39;).show()</span>
<span class="sd"> +--------+-----------+</span>
<span class="sd"> |avg(age)|avg(height)|</span>
<span class="sd"> +--------+-----------+</span>
<span class="sd"> | 5.0| 110.0|</span>
<span class="sd"> +--------+-----------+</span>
<span class="sd"> &quot;&quot;&quot;</span></div>
<div class="viewcode-block" id="GroupedData.max"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.GroupedData.max.html#pyspark.sql.GroupedData.max">[docs]</a> <span class="nd">@df_varargs_api</span>
<span class="k">def</span><span class="w"> </span><span class="nf">max</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Computes the max value for each numeric columns for each group.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (2, &quot;Alice&quot;, 80), (3, &quot;Alice&quot;, 100),</span>
<span class="sd"> ... (5, &quot;Bob&quot;, 120), (10, &quot;Bob&quot;, 140)], [&quot;age&quot;, &quot;name&quot;, &quot;height&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> |age| name|height|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> | 2|Alice| 80|</span>
<span class="sd"> | 3|Alice| 100|</span>
<span class="sd"> | 5| Bob| 120|</span>
<span class="sd"> | 10| Bob| 140|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> Group-by name, and calculate the max of the age in each group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(&quot;name&quot;).max(&quot;age&quot;).sort(&quot;name&quot;).show()</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> | name|max(age)|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> |Alice| 3|</span>
<span class="sd"> | Bob| 10|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> Calculate the max of the age and height in all data.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy().max(&quot;age&quot;, &quot;height&quot;).show()</span>
<span class="sd"> +--------+-----------+</span>
<span class="sd"> |max(age)|max(height)|</span>
<span class="sd"> +--------+-----------+</span>
<span class="sd"> | 10| 140|</span>
<span class="sd"> +--------+-----------+</span>
<span class="sd"> &quot;&quot;&quot;</span></div>
<div class="viewcode-block" id="GroupedData.min"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.GroupedData.min.html#pyspark.sql.GroupedData.min">[docs]</a> <span class="nd">@df_varargs_api</span>
<span class="k">def</span><span class="w"> </span><span class="nf">min</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Computes the min value for each numeric column for each group.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : str</span>
<span class="sd"> column names. Non-numeric columns are ignored.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (2, &quot;Alice&quot;, 80), (3, &quot;Alice&quot;, 100),</span>
<span class="sd"> ... (5, &quot;Bob&quot;, 120), (10, &quot;Bob&quot;, 140)], [&quot;age&quot;, &quot;name&quot;, &quot;height&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> |age| name|height|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> | 2|Alice| 80|</span>
<span class="sd"> | 3|Alice| 100|</span>
<span class="sd"> | 5| Bob| 120|</span>
<span class="sd"> | 10| Bob| 140|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> Group-by name, and calculate the min of the age in each group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(&quot;name&quot;).min(&quot;age&quot;).sort(&quot;name&quot;).show()</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> | name|min(age)|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> | Bob| 5|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> Calculate the min of the age and height in all data.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy().min(&quot;age&quot;, &quot;height&quot;).show()</span>
<span class="sd"> +--------+-----------+</span>
<span class="sd"> |min(age)|min(height)|</span>
<span class="sd"> +--------+-----------+</span>
<span class="sd"> | 2| 80|</span>
<span class="sd"> +--------+-----------+</span>
<span class="sd"> &quot;&quot;&quot;</span></div>
<div class="viewcode-block" id="GroupedData.sum"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.GroupedData.sum.html#pyspark.sql.GroupedData.sum">[docs]</a> <span class="nd">@df_varargs_api</span>
<span class="k">def</span><span class="w"> </span><span class="nf">sum</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Computes the sum for each numeric columns for each group.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : str</span>
<span class="sd"> column names. Non-numeric columns are ignored.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (2, &quot;Alice&quot;, 80), (3, &quot;Alice&quot;, 100),</span>
<span class="sd"> ... (5, &quot;Bob&quot;, 120), (10, &quot;Bob&quot;, 140)], [&quot;age&quot;, &quot;name&quot;, &quot;height&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> |age| name|height|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> | 2|Alice| 80|</span>
<span class="sd"> | 3|Alice| 100|</span>
<span class="sd"> | 5| Bob| 120|</span>
<span class="sd"> | 10| Bob| 140|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> Group-by name, and calculate the sum of the age in each group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(&quot;name&quot;).sum(&quot;age&quot;).sort(&quot;name&quot;).show()</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> | name|sum(age)|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> |Alice| 5|</span>
<span class="sd"> | Bob| 15|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> Calculate the sum of the age and height in all data.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy().sum(&quot;age&quot;, &quot;height&quot;).show()</span>
<span class="sd"> +--------+-----------+</span>
<span class="sd"> |sum(age)|sum(height)|</span>
<span class="sd"> +--------+-----------+</span>
<span class="sd"> | 20| 440|</span>
<span class="sd"> +--------+-----------+</span>
<span class="sd"> &quot;&quot;&quot;</span></div>
<span class="c1"># TODO(SPARK-41746): SparkSession.createDataFrame does not support nested datatypes</span>
<div class="viewcode-block" id="GroupedData.pivot"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.GroupedData.pivot.html#pyspark.sql.GroupedData.pivot">[docs]</a> <span class="k">def</span><span class="w"> </span><span class="nf">pivot</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">pivot_col</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;LiteralType&quot;</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Pivots a column of the current :class:`DataFrame` and perform the specified aggregation.</span>
<span class="sd"> There are two versions of the pivot function: one that requires the caller</span>
<span class="sd"> to specify the list of distinct values to pivot on, and one that does not.</span>
<span class="sd"> The latter is more concise but less efficient,</span>
<span class="sd"> because Spark needs to first compute the list of distinct values internally.</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> pivot_col : str</span>
<span class="sd"> Name of the column to pivot.</span>
<span class="sd"> values : list, optional</span>
<span class="sd"> List of values that will be translated to columns in the output DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([</span>
<span class="sd"> ... Row(course=&quot;dotNET&quot;, year=2012, earnings=10000),</span>
<span class="sd"> ... Row(course=&quot;Java&quot;, year=2012, earnings=20000),</span>
<span class="sd"> ... Row(course=&quot;dotNET&quot;, year=2012, earnings=5000),</span>
<span class="sd"> ... Row(course=&quot;dotNET&quot;, year=2013, earnings=48000),</span>
<span class="sd"> ... Row(course=&quot;Java&quot;, year=2013, earnings=30000),</span>
<span class="sd"> ... ])</span>
<span class="sd"> &gt;&gt;&gt; df1.show()</span>
<span class="sd"> +------+----+--------+</span>
<span class="sd"> |course|year|earnings|</span>
<span class="sd"> +------+----+--------+</span>
<span class="sd"> |dotNET|2012| 10000|</span>
<span class="sd"> | Java|2012| 20000|</span>
<span class="sd"> |dotNET|2012| 5000|</span>
<span class="sd"> |dotNET|2013| 48000|</span>
<span class="sd"> | Java|2013| 30000|</span>
<span class="sd"> +------+----+--------+</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([</span>
<span class="sd"> ... Row(training=&quot;expert&quot;, sales=Row(course=&quot;dotNET&quot;, year=2012, earnings=10000)),</span>
<span class="sd"> ... Row(training=&quot;junior&quot;, sales=Row(course=&quot;Java&quot;, year=2012, earnings=20000)),</span>
<span class="sd"> ... Row(training=&quot;expert&quot;, sales=Row(course=&quot;dotNET&quot;, year=2012, earnings=5000)),</span>
<span class="sd"> ... Row(training=&quot;junior&quot;, sales=Row(course=&quot;dotNET&quot;, year=2013, earnings=48000)),</span>
<span class="sd"> ... Row(training=&quot;expert&quot;, sales=Row(course=&quot;Java&quot;, year=2013, earnings=30000)),</span>
<span class="sd"> ... ]) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; df2.show() # doctest: +SKIP</span>
<span class="sd"> +--------+--------------------+</span>
<span class="sd"> |training| sales|</span>
<span class="sd"> +--------+--------------------+</span>
<span class="sd"> | expert|{dotNET, 2012, 10...|</span>
<span class="sd"> | junior| {Java, 2012, 20000}|</span>
<span class="sd"> | expert|{dotNET, 2012, 5000}|</span>
<span class="sd"> | junior|{dotNET, 2013, 48...|</span>
<span class="sd"> | expert| {Java, 2013, 30000}|</span>
<span class="sd"> +--------+--------------------+</span>
<span class="sd"> Compute the sum of earnings for each year by course with each course as a separate column</span>
<span class="sd"> &gt;&gt;&gt; df1.groupBy(&quot;year&quot;).pivot(&quot;course&quot;, [&quot;dotNET&quot;, &quot;Java&quot;]).sum(&quot;earnings&quot;).show()</span>
<span class="sd"> +----+------+-----+</span>
<span class="sd"> |year|dotNET| Java|</span>
<span class="sd"> +----+------+-----+</span>
<span class="sd"> |2012| 15000|20000|</span>
<span class="sd"> |2013| 48000|30000|</span>
<span class="sd"> +----+------+-----+</span>
<span class="sd"> Or without specifying column values (less efficient)</span>
<span class="sd"> &gt;&gt;&gt; df1.groupBy(&quot;year&quot;).pivot(&quot;course&quot;).sum(&quot;earnings&quot;).show()</span>
<span class="sd"> +----+-----+------+</span>
<span class="sd"> |year| Java|dotNET|</span>
<span class="sd"> +----+-----+------+</span>
<span class="sd"> |2012|20000| 15000|</span>
<span class="sd"> |2013|30000| 48000|</span>
<span class="sd"> +----+-----+------+</span>
<span class="sd"> &gt;&gt;&gt; df2.groupBy(&quot;sales.year&quot;).pivot(&quot;sales.course&quot;).sum(&quot;sales.earnings&quot;).show()</span>
<span class="sd"> ... # doctest: +SKIP</span>
<span class="sd"> +----+-----+------+</span>
<span class="sd"> |year| Java|dotNET|</span>
<span class="sd"> +----+-----+------+</span>
<span class="sd"> |2012|20000| 15000|</span>
<span class="sd"> |2013|30000| 48000|</span>
<span class="sd"> +----+-----+------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">values</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">jgd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jgd</span><span class="o">.</span><span class="n">pivot</span><span class="p">(</span><span class="n">pivot_col</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">jgd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jgd</span><span class="o">.</span><span class="n">pivot</span><span class="p">(</span><span class="n">pivot_col</span><span class="p">,</span> <span class="n">values</span><span class="p">)</span>
<span class="k">return</span> <span class="n">GroupedData</span><span class="p">(</span><span class="n">jgd</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_df</span><span class="p">)</span></div></div>
<span class="k">def</span><span class="w"> </span><span class="nf">_test</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">doctest</span>
<span class="kn">from</span><span class="w"> </span><span class="nn">pyspark.sql</span><span class="w"> </span><span class="kn">import</span> <span class="n">SparkSession</span>
<span class="kn">import</span><span class="w"> </span><span class="nn">pyspark.sql.group</span>
<span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">group</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[4]&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;sql.group tests&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;spark&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">spark</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">group</span><span class="p">,</span>
<span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span>
<span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">REPORT_NDIFF</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">spark</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="n">_test</span><span class="p">()</span>