Source code for pyspark.sql.dataframe

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# mypy: disable-error-code="empty-body"

from typing import (
    Any,
    Callable,
    Dict,
    Iterator,
    List,
    Optional,
    Sequence,
    Tuple,
    Union,
    overload,
    TYPE_CHECKING,
)

from pyspark import _NoValue
from pyspark._globals import _NoValueType
from pyspark.util import is_remote_only
from pyspark.storagelevel import StorageLevel
from pyspark.resource import ResourceProfile
from pyspark.sql.column import Column
from pyspark.sql.readwriter import DataFrameWriter, DataFrameWriterV2
from pyspark.sql.streaming import DataStreamWriter
from pyspark.sql.types import StructType, Row
from pyspark.sql.utils import dispatch_df_method

if TYPE_CHECKING:
    from py4j.java_gateway import JavaObject
    import pyarrow as pa
    from pyspark.core.context import SparkContext
    from pyspark.core.rdd import RDD
    from pyspark._typing import PrimitiveType
    from pyspark.pandas.frame import DataFrame as PandasOnSparkDataFrame
    from pyspark.sql._typing import (
        ColumnOrName,
        ColumnOrNameOrOrdinal,
        LiteralType,
        OptionalPrimitiveType,
    )
    from pyspark.sql.context import SQLContext
    from pyspark.sql.session import SparkSession
    from pyspark.sql.group import GroupedData
    from pyspark.sql.observation import Observation
    from pyspark.sql.pandas._typing import (
        PandasMapIterFunction,
        ArrowMapIterFunction,
        DataFrameLike as PandasDataFrameLike,
    )

__all__ = ["DataFrame", "DataFrameNaFunctions", "DataFrameStatFunctions"]
<div class="viewcode-block" id="DataFrame"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.html#pyspark.sql.DataFrame">[docs]</a><span class="k">class</span> <span class="nc">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;A distributed collection of data grouped into named columns.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> A :class:`DataFrame` is equivalent to a relational table in Spark SQL,</span>
<span class="sd"> and can be created using various functions in :class:`SparkSession`:</span>
<span class="sd"> &gt;&gt;&gt; people = spark.createDataFrame([</span>
<span class="sd"> ... {&quot;deptId&quot;: 1, &quot;age&quot;: 40, &quot;name&quot;: &quot;Hyukjin Kwon&quot;, &quot;gender&quot;: &quot;M&quot;, &quot;salary&quot;: 50},</span>
<span class="sd"> ... {&quot;deptId&quot;: 1, &quot;age&quot;: 50, &quot;name&quot;: &quot;Takuya Ueshin&quot;, &quot;gender&quot;: &quot;M&quot;, &quot;salary&quot;: 100},</span>
<span class="sd"> ... {&quot;deptId&quot;: 2, &quot;age&quot;: 60, &quot;name&quot;: &quot;Xinrong Meng&quot;, &quot;gender&quot;: &quot;F&quot;, &quot;salary&quot;: 150},</span>
<span class="sd"> ... {&quot;deptId&quot;: 3, &quot;age&quot;: 20, &quot;name&quot;: &quot;Haejoon Lee&quot;, &quot;gender&quot;: &quot;M&quot;, &quot;salary&quot;: 200}</span>
<span class="sd"> ... ])</span>
<span class="sd"> Once created, it can be manipulated using the various domain-specific-language</span>
<span class="sd"> (DSL) functions defined in: :class:`DataFrame`, :class:`Column`.</span>
<span class="sd"> To select a column from the :class:`DataFrame`, use the apply method:</span>
<span class="sd"> &gt;&gt;&gt; age_col = people.age</span>
<span class="sd"> A more concrete example:</span>
<span class="sd"> &gt;&gt;&gt; # To create DataFrame using SparkSession</span>
<span class="sd"> ... department = spark.createDataFrame([</span>
<span class="sd"> ... {&quot;id&quot;: 1, &quot;name&quot;: &quot;PySpark&quot;},</span>
<span class="sd"> ... {&quot;id&quot;: 2, &quot;name&quot;: &quot;ML&quot;},</span>
<span class="sd"> ... {&quot;id&quot;: 3, &quot;name&quot;: &quot;Spark SQL&quot;}</span>
<span class="sd"> ... ])</span>
<span class="sd"> &gt;&gt;&gt; people.filter(people.age &gt; 30).join(</span>
<span class="sd"> ... department, people.deptId == department.id).groupBy(</span>
<span class="sd"> ... department.name, &quot;gender&quot;).agg(</span>
<span class="sd"> ... {&quot;salary&quot;: &quot;avg&quot;, &quot;age&quot;: &quot;max&quot;}).sort(&quot;max(age)&quot;).show()</span>
<span class="sd"> +-------+------+-----------+--------+</span>
<span class="sd"> | name|gender|avg(salary)|max(age)|</span>
<span class="sd"> +-------+------+-----------+--------+</span>
<span class="sd"> |PySpark| M| 75.0| 50|</span>
<span class="sd"> | ML| F| 150.0| 60|</span>
<span class="sd"> +-------+------+-----------+--------+</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> A DataFrame should only be created as described above. It should not be directly</span>
<span class="sd"> created via using the constructor.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># HACK ALERT!! this is to reduce the backward compatibility concern, and returns</span>
<span class="c1"># Spark Classic DataFrame by default. This is NOT an API, and NOT supposed to</span>
<span class="c1"># be directly invoked. DO NOT use this constructor.</span>
<span class="n">_sql_ctx</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;SQLContext&quot;</span><span class="p">]</span>
<span class="n">_session</span><span class="p">:</span> <span class="s2">&quot;SparkSession&quot;</span>
<span class="n">_sc</span><span class="p">:</span> <span class="s2">&quot;SparkContext&quot;</span>
<span class="n">_jdf</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span>
<span class="n">is_cached</span><span class="p">:</span> <span class="nb">bool</span>
<span class="n">_schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">StructType</span><span class="p">]</span>
<span class="n">_lazy_rdd</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;RDD[Row]&quot;</span><span class="p">]</span>
<span class="n">_support_repr_html</span><span class="p">:</span> <span class="nb">bool</span>
<span class="k">def</span> <span class="fm">__new__</span><span class="p">(</span>
<span class="bp">cls</span><span class="p">,</span>
<span class="n">jdf</span><span class="p">:</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">,</span>
<span class="n">sql_ctx</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;SQLContext&quot;</span><span class="p">,</span> <span class="s2">&quot;SparkSession&quot;</span><span class="p">],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.classic.dataframe</span> <span class="kn">import</span> <span class="n">DataFrame</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="o">.</span><span class="fm">__new__</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">jdf</span><span class="p">,</span> <span class="n">sql_ctx</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">sparkSession</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns Spark session that created this :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`SparkSession`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; type(df.sparkSession)</span>
<span class="sd"> &lt;class &#39;...session.SparkSession&#39;&gt;</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_remote_only</span><span class="p">():</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">rdd</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RDD[Row]&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the content as an :class:`pyspark.RDD` of :class:`Row`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`RDD`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; type(df.rdd)</span>
<span class="sd"> &lt;class &#39;pyspark.core.rdd.RDD&#39;&gt;</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">na</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrameNaFunctions&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a :class:`DataFrameNaFunctions` for handling missing values.</span>
<span class="sd"> .. versionadded:: 1.3.1</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrameNaFunctions`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.sql(&quot;SELECT 1 AS c1, int(NULL) AS c2&quot;)</span>
<span class="sd"> &gt;&gt;&gt; type(df.na)</span>
<span class="sd"> &lt;class &#39;...dataframe.DataFrameNaFunctions&#39;&gt;</span>
<span class="sd"> Replace the missing values as 2.</span>
<span class="sd"> &gt;&gt;&gt; df.na.fill(2).show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | c1| c2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">stat</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrameStatFunctions&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a :class:`DataFrameStatFunctions` for statistic functions.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrameStatFunctions`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as f</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(3).withColumn(&quot;c&quot;, f.expr(&quot;id + 1&quot;))</span>
<span class="sd"> &gt;&gt;&gt; type(df.stat)</span>
<span class="sd"> &lt;class &#39;...dataframe.DataFrameStatFunctions&#39;&gt;</span>
<span class="sd"> &gt;&gt;&gt; df.stat.corr(&quot;id&quot;, &quot;c&quot;)</span>
<span class="sd"> 1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_remote_only</span><span class="p">():</span>
<div class="viewcode-block" id="DataFrame.toJSON"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.toJSON.html#pyspark.sql.DataFrame.toJSON">[docs]</a> <span class="k">def</span> <span class="nf">toJSON</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">use_unicode</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;RDD[str]&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Converts a :class:`DataFrame` into a :class:`RDD` of string.</span>
<span class="sd"> Each row is turned into a JSON document as one element in the returned RDD.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> use_unicode : bool, optional, default True</span>
<span class="sd"> Whether to convert to unicode or not.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`RDD`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.toJSON().first()</span>
<span class="sd"> &#39;{&quot;age&quot;:2,&quot;name&quot;:&quot;Alice&quot;}&#39;</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.registerTempTable"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.registerTempTable.html#pyspark.sql.DataFrame.registerTempTable">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">registerTempTable</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Registers this :class:`DataFrame` as a temporary table using the given name.</span>
<span class="sd"> The lifetime of this temporary table is tied to the :class:`SparkSession`</span>
<span class="sd"> that was used to create this :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> .. deprecated:: 2.0.0</span>
<span class="sd"> Use :meth:`DataFrame.createOrReplaceTempView` instead.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : str</span>
<span class="sd"> Name of the temporary table to register.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.registerTempTable(&quot;people&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.sql(&quot;SELECT * FROM people&quot;)</span>
<span class="sd"> &gt;&gt;&gt; sorted(df.collect()) == sorted(df2.collect())</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; spark.catalog.dropTempView(&quot;people&quot;)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.createTempView"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.createTempView.html#pyspark.sql.DataFrame.createTempView">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">createTempView</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Creates a local temporary view with this :class:`DataFrame`.</span>
<span class="sd"> The lifetime of this temporary table is tied to the :class:`SparkSession`</span>
<span class="sd"> that was used to create this :class:`DataFrame`.</span>
<span class="sd"> throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the</span>
<span class="sd"> catalog.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : str</span>
<span class="sd"> Name of the view.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Creating and querying a local temporary view</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.createTempView(&quot;people&quot;)</span>
<span class="sd"> &gt;&gt;&gt; spark.sql(&quot;SELECT * FROM people&quot;).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Example 2: Attempting to create a temporary view with an existing name</span>
<span class="sd"> &gt;&gt;&gt; df.createTempView(&quot;people&quot;) # doctest: +IGNORE_EXCEPTION_DETAIL</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> AnalysisException: &quot;Temporary table &#39;people&#39; already exists;&quot;</span>
<span class="sd"> Example 3: Creating and dropping a local temporary view</span>
<span class="sd"> &gt;&gt;&gt; spark.catalog.dropTempView(&quot;people&quot;)</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; df.createTempView(&quot;people&quot;)</span>
<span class="sd"> Example 4: Creating temporary views with multiple DataFrames with</span>
<span class="sd"> :meth:`SparkSession.table`</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(1, &quot;John&quot;), (2, &quot;Jane&quot;)], schema=[&quot;id&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(3, &quot;Jake&quot;), (4, &quot;Jill&quot;)], schema=[&quot;id&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df1.createTempView(&quot;table1&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df2.createTempView(&quot;table2&quot;)</span>
<span class="sd"> &gt;&gt;&gt; result_df = spark.table(&quot;table1&quot;).union(spark.table(&quot;table2&quot;))</span>
<span class="sd"> &gt;&gt;&gt; result_df.show()</span>
<span class="sd"> +---+----+</span>
<span class="sd"> | id|name|</span>
<span class="sd"> +---+----+</span>
<span class="sd"> | 1|John|</span>
<span class="sd"> | 2|Jane|</span>
<span class="sd"> | 3|Jake|</span>
<span class="sd"> | 4|Jill|</span>
<span class="sd"> +---+----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.createOrReplaceTempView"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.createOrReplaceTempView.html#pyspark.sql.DataFrame.createOrReplaceTempView">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">createOrReplaceTempView</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Creates or replaces a local temporary view with this :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : str</span>
<span class="sd"> Name of the view.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The lifetime of this temporary table is tied to the :class:`SparkSession`</span>
<span class="sd"> that was used to create this :class:`DataFrame`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Creating a local temporary view named &#39;people&#39;.</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.createOrReplaceTempView(&quot;people&quot;)</span>
<span class="sd"> Example 2: Replacing the local temporary view.</span>
<span class="sd"> &gt;&gt;&gt; df2 = df.filter(df.age &gt; 3)</span>
<span class="sd"> &gt;&gt;&gt; # Replace the local temporary view with the filtered DataFrame</span>
<span class="sd"> &gt;&gt;&gt; df2.createOrReplaceTempView(&quot;people&quot;)</span>
<span class="sd"> &gt;&gt;&gt; # Query the temporary view</span>
<span class="sd"> &gt;&gt;&gt; df3 = spark.sql(&quot;SELECT * FROM people&quot;)</span>
<span class="sd"> &gt;&gt;&gt; # Check if the DataFrames are equal</span>
<span class="sd"> ... assert sorted(df3.collect()) == sorted(df2.collect())</span>
<span class="sd"> Example 3: Dropping the temporary view.</span>
<span class="sd"> &gt;&gt;&gt; # Drop the local temporary view</span>
<span class="sd"> ... spark.catalog.dropTempView(&quot;people&quot;)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.createGlobalTempView"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.createGlobalTempView.html#pyspark.sql.DataFrame.createGlobalTempView">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">createGlobalTempView</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Creates a global temporary view with this :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : str</span>
<span class="sd"> Name of the view.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The lifetime of this temporary view is tied to this Spark application.</span>
<span class="sd"> throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the</span>
<span class="sd"> catalog.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Creating and querying a global temporary view</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.createGlobalTempView(&quot;people&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.sql(&quot;SELECT * FROM global_temp.people&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df2.show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Example 2: Attempting to create a duplicate global temporary view</span>
<span class="sd"> &gt;&gt;&gt; df.createGlobalTempView(&quot;people&quot;) # doctest: +IGNORE_EXCEPTION_DETAIL</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> AnalysisException: &quot;Temporary table &#39;people&#39; already exists;&quot;</span>
<span class="sd"> Example 3: Dropping a global temporary view</span>
<span class="sd"> &gt;&gt;&gt; spark.catalog.dropGlobalTempView(&quot;people&quot;)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.createOrReplaceGlobalTempView"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.createOrReplaceGlobalTempView.html#pyspark.sql.DataFrame.createOrReplaceGlobalTempView">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">createOrReplaceGlobalTempView</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Creates or replaces a global temporary view using the given name.</span>
<span class="sd"> The lifetime of this temporary view is tied to this Spark application.</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : str</span>
<span class="sd"> Name of the view.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Creating a global temporary view with a DataFrame</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.createOrReplaceGlobalTempView(&quot;people&quot;)</span>
<span class="sd"> Example 2: Replacing a global temporary view with a filtered DataFrame</span>
<span class="sd"> &gt;&gt;&gt; df2 = df.filter(df.age &gt; 3)</span>
<span class="sd"> &gt;&gt;&gt; df2.createOrReplaceGlobalTempView(&quot;people&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df3 = spark.table(&quot;global_temp.people&quot;)</span>
<span class="sd"> &gt;&gt;&gt; sorted(df3.collect()) == sorted(df2.collect())</span>
<span class="sd"> True</span>
<span class="sd"> Example 3: Dropping a global temporary view</span>
<span class="sd"> &gt;&gt;&gt; spark.catalog.dropGlobalTempView(&quot;people&quot;)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">write</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrameWriter</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Interface for saving the content of the non-streaming :class:`DataFrame` out into external</span>
<span class="sd"> storage.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrameWriter`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; type(df.write)</span>
<span class="sd"> &lt;class &#39;...readwriter.DataFrameWriter&#39;&gt;</span>
<span class="sd"> Write the DataFrame as a table.</span>
<span class="sd"> &gt;&gt;&gt; _ = spark.sql(&quot;DROP TABLE IF EXISTS tab2&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.write.saveAsTable(&quot;tab2&quot;)</span>
<span class="sd"> &gt;&gt;&gt; _ = spark.sql(&quot;DROP TABLE tab2&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">writeStream</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataStreamWriter</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Interface for saving the content of the streaming :class:`DataFrame` out into external</span>
<span class="sd"> storage.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.5.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This API is evolving.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataStreamWriter`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import time</span>
<span class="sd"> &gt;&gt;&gt; import tempfile</span>
<span class="sd"> &gt;&gt;&gt; df = spark.readStream.format(&quot;rate&quot;).load()</span>
<span class="sd"> &gt;&gt;&gt; type(df.writeStream)</span>
<span class="sd"> &lt;class &#39;...streaming.readwriter.DataStreamWriter&#39;&gt;</span>
<span class="sd"> &gt;&gt;&gt; with tempfile.TemporaryDirectory(prefix=&quot;writeStream&quot;) as d:</span>
<span class="sd"> ... # Create a table with Rate source.</span>
<span class="sd"> ... query = df.writeStream.toTable(</span>
<span class="sd"> ... &quot;my_table&quot;, checkpointLocation=d)</span>
<span class="sd"> ... time.sleep(3)</span>
<span class="sd"> ... query.stop()</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">schema</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">StructType</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the schema of this :class:`DataFrame` as a :class:`pyspark.sql.types.StructType`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`StructType`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Retrieve the inferred schema of the current DataFrame.</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.schema</span>
<span class="sd"> StructType([StructField(&#39;age&#39;, LongType(), True),</span>
<span class="sd"> StructField(&#39;name&#39;, StringType(), True)])</span>
<span class="sd"> Example 2: Retrieve the schema of the current DataFrame (DDL-formatted schema).</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)],</span>
<span class="sd"> ... &quot;age INT, name STRING&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.schema</span>
<span class="sd"> StructType([StructField(&#39;age&#39;, IntegerType(), True),</span>
<span class="sd"> StructField(&#39;name&#39;, StringType(), True)])</span>
<span class="sd"> Example 3: Retrieve the specified schema of the current DataFrame.</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import StructType, StructField, StringType</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;a&quot;,), (&quot;b&quot;,), (&quot;c&quot;,)],</span>
<span class="sd"> ... StructType([StructField(&quot;value&quot;, StringType(), False)]))</span>
<span class="sd"> &gt;&gt;&gt; df.schema</span>
<span class="sd"> StructType([StructField(&#39;value&#39;, StringType(), False)])</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.printSchema"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.printSchema.html#pyspark.sql.DataFrame.printSchema">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">printSchema</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">level</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Prints out the schema in the tree format.</span>
<span class="sd"> Optionally allows to specify how many levels to print if schema is nested.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> level : int, optional</span>
<span class="sd"> How many levels to print for nested schemas.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Printing the schema of a DataFrame with basic columns</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- age: long (nullable = true)</span>
<span class="sd"> |-- name: string (nullable = true)</span>
<span class="sd"> Example 2: Printing the schema with a specified level for nested columns</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, (2, 2))], [&quot;a&quot;, &quot;b&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.printSchema(1)</span>
<span class="sd"> root</span>
<span class="sd"> |-- a: long (nullable = true)</span>
<span class="sd"> |-- b: struct (nullable = true)</span>
<span class="sd"> Example 3: Printing the schema with deeper nesting level</span>
<span class="sd"> &gt;&gt;&gt; df.printSchema(2)</span>
<span class="sd"> root</span>
<span class="sd"> |-- a: long (nullable = true)</span>
<span class="sd"> |-- b: struct (nullable = true)</span>
<span class="sd"> | |-- _1: long (nullable = true)</span>
<span class="sd"> | |-- _2: long (nullable = true)</span>
<span class="sd"> Example 4: Printing the schema of a DataFrame with nullable and non-nullable columns</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1).selectExpr(&quot;id AS nonnullable&quot;, &quot;NULL AS nullable&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.printSchema()</span>
<span class="sd"> root</span>
<span class="sd"> |-- nonnullable: long (nullable = false)</span>
<span class="sd"> |-- nullable: void (nullable = true)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.explain"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.explain.html#pyspark.sql.DataFrame.explain">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">explain</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">extended</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">mode</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Prints the (logical and physical) plans to the console for debugging purposes.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> extended : bool, optional</span>
<span class="sd"> default ``False``. If ``False``, prints only the physical plan.</span>
<span class="sd"> When this is a string without specifying the ``mode``, it works as the mode is</span>
<span class="sd"> specified.</span>
<span class="sd"> mode : str, optional</span>
<span class="sd"> specifies the expected output format of plans.</span>
<span class="sd"> * ``simple``: Print only a physical plan.</span>
<span class="sd"> * ``extended``: Print both logical and physical plans.</span>
<span class="sd"> * ``codegen``: Print a physical plan and generated codes if they are available.</span>
<span class="sd"> * ``cost``: Print a logical plan and statistics if they are available.</span>
<span class="sd"> * ``formatted``: Split explain output into two sections: a physical plan outline \</span>
<span class="sd"> and node details.</span>
<span class="sd"> .. versionchanged:: 3.0.0</span>
<span class="sd"> Added optional argument `mode` to specify the expected output format of plans.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Print out the physical plan only (default).</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.explain() # doctest: +SKIP</span>
<span class="sd"> == Physical Plan ==</span>
<span class="sd"> *(1) Scan ExistingRDD[age...,name...]</span>
<span class="sd"> Example 2: Print out all parsed, analyzed, optimized, and physical plans.</span>
<span class="sd"> &gt;&gt;&gt; df.explain(extended=True)</span>
<span class="sd"> == Parsed Logical Plan ==</span>
<span class="sd"> ...</span>
<span class="sd"> == Analyzed Logical Plan ==</span>
<span class="sd"> ...</span>
<span class="sd"> == Optimized Logical Plan ==</span>
<span class="sd"> ...</span>
<span class="sd"> == Physical Plan ==</span>
<span class="sd"> ...</span>
<span class="sd"> Example 3: Print out the plans with two sections: a physical plan outline and node details.</span>
<span class="sd"> &gt;&gt;&gt; df.explain(mode=&quot;formatted&quot;) # doctest: +SKIP</span>
<span class="sd"> == Physical Plan ==</span>
<span class="sd"> * Scan ExistingRDD (...)</span>
<span class="sd"> (1) Scan ExistingRDD [codegen id : ...]</span>
<span class="sd"> Output [2]: [age..., name...]</span>
<span class="sd"> ...</span>
<span class="sd"> Example 4: Print a logical plan and statistics if they are available.</span>
<span class="sd"> &gt;&gt;&gt; df.explain(mode=&quot;cost&quot;)</span>
<span class="sd"> == Optimized Logical Plan ==</span>
<span class="sd"> ...Statistics...</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.exceptAll"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.exceptAll.html#pyspark.sql.DataFrame.exceptAll">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">exceptAll</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Return a new :class:`DataFrame` containing rows in this :class:`DataFrame` but</span>
<span class="sd"> not in another :class:`DataFrame` while preserving duplicates.</span>
<span class="sd"> This is equivalent to `EXCEPT ALL` in SQL.</span>
<span class="sd"> As standard in SQL, this function resolves columns by position (not by name).</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> other : :class:`DataFrame`</span>
<span class="sd"> The other :class:`DataFrame` to compare to.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;a&quot;, 1), (&quot;a&quot;, 1), (&quot;a&quot;, 1), (&quot;a&quot;, 2), (&quot;b&quot;, 3), (&quot;c&quot;, 4)], [&quot;C1&quot;, &quot;C2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(&quot;a&quot;, 1), (&quot;b&quot;, 3)], [&quot;C1&quot;, &quot;C2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df1.exceptAll(df2).show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | C1| C2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | a| 1|</span>
<span class="sd"> | a| 1|</span>
<span class="sd"> | a| 2|</span>
<span class="sd"> | c| 4|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.isLocal"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.isLocal.html#pyspark.sql.DataFrame.isLocal">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">isLocal</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns ``True`` if the :func:`collect` and :func:`take` methods can be run locally</span>
<span class="sd"> (without any Spark executors).</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> bool</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.sql(&quot;SHOW TABLES&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.isLocal()</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">isStreaming</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns ``True`` if this :class:`DataFrame` contains one or more sources that</span>
<span class="sd"> continuously return data as it arrives. A :class:`DataFrame` that reads data from a</span>
<span class="sd"> streaming source must be executed as a :class:`StreamingQuery` using the :func:`start`</span>
<span class="sd"> method in :class:`DataStreamWriter`. Methods that return a single answer, (e.g.,</span>
<span class="sd"> :func:`count` or :func:`collect`) will throw an :class:`AnalysisException` when there</span>
<span class="sd"> is a streaming source present.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This API is evolving.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> bool</span>
<span class="sd"> Whether it&#39;s streaming DataFrame or not.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.readStream.format(&quot;rate&quot;).load()</span>
<span class="sd"> &gt;&gt;&gt; df.isStreaming</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.isEmpty"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.isEmpty.html#pyspark.sql.DataFrame.isEmpty">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">isEmpty</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Checks if the :class:`DataFrame` is empty and returns a boolean value.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> bool</span>
<span class="sd"> Returns ``True`` if the DataFrame is empty, ``False`` otherwise.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.count : Counts the number of rows in DataFrame.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> - Unlike `count()`, this method does not trigger any computation.</span>
<span class="sd"> - An empty DataFrame has no rows. It may have columns, but no data.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Checking if an empty DataFrame is empty</span>
<span class="sd"> &gt;&gt;&gt; df_empty = spark.createDataFrame([], &#39;a STRING&#39;)</span>
<span class="sd"> &gt;&gt;&gt; df_empty.isEmpty()</span>
<span class="sd"> True</span>
<span class="sd"> Example 2: Checking if a non-empty DataFrame is empty</span>
<span class="sd"> &gt;&gt;&gt; df_non_empty = spark.createDataFrame([&quot;a&quot;], &#39;STRING&#39;)</span>
<span class="sd"> &gt;&gt;&gt; df_non_empty.isEmpty()</span>
<span class="sd"> False</span>
<span class="sd"> Example 3: Checking if a DataFrame with null values is empty</span>
<span class="sd"> &gt;&gt;&gt; df_nulls = spark.createDataFrame([(None, None)], &#39;a STRING, b INT&#39;)</span>
<span class="sd"> &gt;&gt;&gt; df_nulls.isEmpty()</span>
<span class="sd"> False</span>
<span class="sd"> Example 4: Checking if a DataFrame with no rows but with columns is empty</span>
<span class="sd"> &gt;&gt;&gt; df_no_rows = spark.createDataFrame([], &#39;id INT, value STRING&#39;)</span>
<span class="sd"> &gt;&gt;&gt; df_no_rows.isEmpty()</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.show"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.show.html#pyspark.sql.DataFrame.show">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">show</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span> <span class="n">truncate</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">vertical</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Prints the first ``n`` rows of the DataFrame to the console.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> n : int, optional, default 20</span>
<span class="sd"> Number of rows to show.</span>
<span class="sd"> truncate : bool or int, optional, default True</span>
<span class="sd"> If set to ``True``, truncate strings longer than 20 chars.</span>
<span class="sd"> If set to a number greater than one, truncates long strings to length ``truncate``</span>
<span class="sd"> and align cells right.</span>
<span class="sd"> vertical : bool, optional</span>
<span class="sd"> If set to ``True``, print output rows vertically (one line per column value).</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;), (19, &quot;This is a super long name&quot;)],</span>
<span class="sd"> ... [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> Show :class:`DataFrame`</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +---+--------------------+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+--------------------+</span>
<span class="sd"> | 14| Tom|</span>
<span class="sd"> | 23| Alice|</span>
<span class="sd"> | 16| Bob|</span>
<span class="sd"> | 19|This is a super l...|</span>
<span class="sd"> +---+--------------------+</span>
<span class="sd"> Show only top 2 rows.</span>
<span class="sd"> &gt;&gt;&gt; df.show(2)</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 14| Tom|</span>
<span class="sd"> | 23|Alice|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> only showing top 2 rows</span>
<span class="sd"> Show full column content without truncation.</span>
<span class="sd"> &gt;&gt;&gt; df.show(truncate=False)</span>
<span class="sd"> +---+-------------------------+</span>
<span class="sd"> |age|name |</span>
<span class="sd"> +---+-------------------------+</span>
<span class="sd"> |14 |Tom |</span>
<span class="sd"> |23 |Alice |</span>
<span class="sd"> |16 |Bob |</span>
<span class="sd"> |19 |This is a super long name|</span>
<span class="sd"> +---+-------------------------+</span>
<span class="sd"> Show :class:`DataFrame` where the maximum number of characters is 3.</span>
<span class="sd"> &gt;&gt;&gt; df.show(truncate=3)</span>
<span class="sd"> +---+----+</span>
<span class="sd"> |age|name|</span>
<span class="sd"> +---+----+</span>
<span class="sd"> | 14| Tom|</span>
<span class="sd"> | 23| Ali|</span>
<span class="sd"> | 16| Bob|</span>
<span class="sd"> | 19| Thi|</span>
<span class="sd"> +---+----+</span>
<span class="sd"> Show :class:`DataFrame` vertically.</span>
<span class="sd"> &gt;&gt;&gt; df.show(vertical=True)</span>
<span class="sd"> -RECORD 0--------------------</span>
<span class="sd"> age | 14</span>
<span class="sd"> name | Tom</span>
<span class="sd"> -RECORD 1--------------------</span>
<span class="sd"> age | 23</span>
<span class="sd"> name | Alice</span>
<span class="sd"> -RECORD 2--------------------</span>
<span class="sd"> age | 16</span>
<span class="sd"> name | Bob</span>
<span class="sd"> -RECORD 3--------------------</span>
<span class="sd"> age | 19</span>
<span class="sd"> name | This is a super l...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">_repr_html_</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a :class:`DataFrame` with html code when you enabled eager evaluation</span>
<span class="sd"> by &#39;spark.sql.repl.eagerEval.enabled&#39;, this only called by REPL you are</span>
<span class="sd"> using support eager evaluation with HTML.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.checkpoint"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.checkpoint.html#pyspark.sql.DataFrame.checkpoint">[docs]</a> <span class="k">def</span> <span class="nf">checkpoint</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">eager</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a checkpointed version of this :class:`DataFrame`. Checkpointing can be</span>
<span class="sd"> used to truncate the logical plan of this :class:`DataFrame`, which is especially</span>
<span class="sd"> useful in iterative algorithms where the plan may grow exponentially. It will be</span>
<span class="sd"> saved to files inside the checkpoint directory set with</span>
<span class="sd"> :meth:`SparkContext.setCheckpointDir`, or `spark.checkpoint.dir` configuration.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> eager : bool, optional, default True</span>
<span class="sd"> Whether to checkpoint this :class:`DataFrame` immediately.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Checkpointed DataFrame.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This API is experimental.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.checkpoint(False) # doctest: +SKIP</span>
<span class="sd"> DataFrame[age: bigint, name: string]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.localCheckpoint"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.localCheckpoint.html#pyspark.sql.DataFrame.localCheckpoint">[docs]</a> <span class="k">def</span> <span class="nf">localCheckpoint</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">eager</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a locally checkpointed version of this :class:`DataFrame`. Checkpointing can</span>
<span class="sd"> be used to truncate the logical plan of this :class:`DataFrame`, which is especially</span>
<span class="sd"> useful in iterative algorithms where the plan may grow exponentially. Local checkpoints</span>
<span class="sd"> are stored in the executors using the caching subsystem and therefore they are not</span>
<span class="sd"> reliable.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> eager : bool, optional, default True</span>
<span class="sd"> Whether to checkpoint this :class:`DataFrame` immediately.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Checkpointed DataFrame.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This API is experimental.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.localCheckpoint(False)</span>
<span class="sd"> DataFrame[age: bigint, name: string]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.withWatermark"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.withWatermark.html#pyspark.sql.DataFrame.withWatermark">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">withWatermark</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">eventTime</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">delayThreshold</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Defines an event time watermark for this :class:`DataFrame`. A watermark tracks a point</span>
<span class="sd"> in time before which we assume no more late data is going to arrive.</span>
<span class="sd"> Spark will use this watermark for several purposes:</span>
<span class="sd"> - To know when a given time window aggregation can be finalized and thus can be emitted</span>
<span class="sd"> when using output modes that do not allow updates.</span>
<span class="sd"> - To minimize the amount of state that we need to keep for on-going aggregations.</span>
<span class="sd"> The current watermark is computed by looking at the `MAX(eventTime)` seen across</span>
<span class="sd"> all of the partitions in the query minus a user specified `delayThreshold`. Due to the cost</span>
<span class="sd"> of coordinating this value across partitions, the actual watermark used is only guaranteed</span>
<span class="sd"> to be at least `delayThreshold` behind the actual event time. In some cases we may still</span>
<span class="sd"> process records that arrive more than `delayThreshold` late.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> .. versionchanged:: 3.5.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> eventTime : str</span>
<span class="sd"> the name of the column that contains the event time of the row.</span>
<span class="sd"> delayThreshold : str</span>
<span class="sd"> the minimum delay to wait to data to arrive late, relative to the</span>
<span class="sd"> latest record that has been processed in the form of an interval</span>
<span class="sd"> (e.g. &quot;1 minute&quot; or &quot;5 hours&quot;).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Watermarked DataFrame</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This is a feature only for Structured Streaming.</span>
<span class="sd"> This API is evolving.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import timestamp_seconds</span>
<span class="sd"> &gt;&gt;&gt; df = spark.readStream.format(&quot;rate&quot;).load().selectExpr(</span>
<span class="sd"> ... &quot;value % 5 AS value&quot;, &quot;timestamp&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;value&quot;, df.timestamp.alias(&quot;time&quot;)).withWatermark(&quot;time&quot;, &#39;10 minutes&#39;)</span>
<span class="sd"> DataFrame[value: bigint, time: timestamp]</span>
<span class="sd"> Group the data by window and value (0 - 4), and compute the count of each group.</span>
<span class="sd"> &gt;&gt;&gt; import time</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import window</span>
<span class="sd"> &gt;&gt;&gt; query = (df</span>
<span class="sd"> ... .withWatermark(&quot;timestamp&quot;, &quot;10 minutes&quot;)</span>
<span class="sd"> ... .groupBy(</span>
<span class="sd"> ... window(df.timestamp, &quot;10 minutes&quot;, &quot;5 minutes&quot;),</span>
<span class="sd"> ... df.value)</span>
<span class="sd"> ... ).count().writeStream.outputMode(&quot;complete&quot;).format(&quot;console&quot;).start()</span>
<span class="sd"> &gt;&gt;&gt; time.sleep(3)</span>
<span class="sd"> &gt;&gt;&gt; query.stop()</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.hint"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.hint.html#pyspark.sql.DataFrame.hint">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">hint</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">parameters</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;PrimitiveType&quot;</span><span class="p">,</span> <span class="s2">&quot;Column&quot;</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;PrimitiveType&quot;</span><span class="p">]]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Specifies some hint on the current :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 2.2.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : str</span>
<span class="sd"> A name of the hint.</span>
<span class="sd"> parameters : str, list, float or int</span>
<span class="sd"> Optional parameters.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Hinted DataFrame</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([Row(height=80, name=&quot;Tom&quot;), Row(height=85, name=&quot;Bob&quot;)])</span>
<span class="sd"> &gt;&gt;&gt; df.join(df2, &quot;name&quot;).explain() # doctest: +SKIP</span>
<span class="sd"> == Physical Plan ==</span>
<span class="sd"> ...</span>
<span class="sd"> ... +- SortMergeJoin ...</span>
<span class="sd"> ...</span>
<span class="sd"> Explicitly trigger the broadcast hashjoin by providing the hint in ``df2``.</span>
<span class="sd"> &gt;&gt;&gt; df.join(df2.hint(&quot;broadcast&quot;), &quot;name&quot;).explain()</span>
<span class="sd"> == Physical Plan ==</span>
<span class="sd"> ...</span>
<span class="sd"> ... +- BroadcastHashJoin ...</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.count"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.count.html#pyspark.sql.DataFrame.count">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">count</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the number of rows in this :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> int</span>
<span class="sd"> Number of rows.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> Return the number of rows in the :class:`DataFrame`.</span>
<span class="sd"> &gt;&gt;&gt; df.count()</span>
<span class="sd"> 3</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.collect"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.collect.html#pyspark.sql.DataFrame.collect">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">collect</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Row</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns all the records in the DataFrame as a list of :class:`Row`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> list</span>
<span class="sd"> A list of :class:`Row` objects, each representing a row in the DataFrame.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.take : Returns the first `n` rows.</span>
<span class="sd"> DataFrame.head : Returns the first `n` rows.</span>
<span class="sd"> DataFrame.toPandas : Returns the data as a pandas DataFrame.</span>
<span class="sd"> DataFrame.toArrow : Returns the data as a PyArrow Table.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This method should only be used if the resulting list is expected to be small,</span>
<span class="sd"> as all the data is loaded into the driver&#39;s memory.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example: Collecting all rows of a DataFrame</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.collect()</span>
<span class="sd"> [Row(age=14, name=&#39;Tom&#39;), Row(age=23, name=&#39;Alice&#39;), Row(age=16, name=&#39;Bob&#39;)]</span>
<span class="sd"> Example: Collecting all rows after filtering</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.filter(df.age &gt; 15).collect()</span>
<span class="sd"> [Row(age=23, name=&#39;Alice&#39;), Row(age=16, name=&#39;Bob&#39;)]</span>
<span class="sd"> Example: Collecting all rows after selecting specific columns</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;name&quot;).collect()</span>
<span class="sd"> [Row(name=&#39;Tom&#39;), Row(name=&#39;Alice&#39;), Row(name=&#39;Bob&#39;)]</span>
<span class="sd"> Example: Collecting all rows after applying a function to a column</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import upper</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(upper(df.name)).collect()</span>
<span class="sd"> [Row(upper(name)=&#39;TOM&#39;), Row(upper(name)=&#39;ALICE&#39;), Row(upper(name)=&#39;BOB&#39;)]</span>
<span class="sd"> Example: Collecting all rows from a DataFrame and converting a specific column to a list</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; rows = df.collect()</span>
<span class="sd"> &gt;&gt;&gt; [row[&quot;name&quot;] for row in rows]</span>
<span class="sd"> [&#39;Tom&#39;, &#39;Alice&#39;, &#39;Bob&#39;]</span>
<span class="sd"> Example: Collecting all rows from a DataFrame and converting to a list of dictionaries</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; rows = df.collect()</span>
<span class="sd"> &gt;&gt;&gt; [row.asDict() for row in rows]</span>
<span class="sd"> [{&#39;age&#39;: 14, &#39;name&#39;: &#39;Tom&#39;}, {&#39;age&#39;: 23, &#39;name&#39;: &#39;Alice&#39;}, {&#39;age&#39;: 16, &#39;name&#39;: &#39;Bob&#39;}]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.toLocalIterator"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.toLocalIterator.html#pyspark.sql.DataFrame.toLocalIterator">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">toLocalIterator</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">prefetchPartitions</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Row</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns an iterator that contains all of the rows in this :class:`DataFrame`.</span>
<span class="sd"> The iterator will consume as much memory as the largest partition in this</span>
<span class="sd"> :class:`DataFrame`. With prefetch it may consume up to the memory of the 2 largest</span>
<span class="sd"> partitions.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> prefetchPartitions : bool, optional</span>
<span class="sd"> If Spark should pre-fetch the next partition before it is needed.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> This argument does not take effect for Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> Iterator</span>
<span class="sd"> Iterator of rows.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; list(df.toLocalIterator())</span>
<span class="sd"> [Row(age=14, name=&#39;Tom&#39;), Row(age=23, name=&#39;Alice&#39;), Row(age=16, name=&#39;Bob&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.limit"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.limit.html#pyspark.sql.DataFrame.limit">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">limit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Limits the result count to the number specified.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> num : int</span>
<span class="sd"> Number of records to return. Will return this number of records</span>
<span class="sd"> or all records if the DataFrame contains less than this number of records.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Subset of the records</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.limit(1).show()</span>
<span class="sd"> +---+----+</span>
<span class="sd"> |age|name|</span>
<span class="sd"> +---+----+</span>
<span class="sd"> | 14| Tom|</span>
<span class="sd"> +---+----+</span>
<span class="sd"> &gt;&gt;&gt; df.limit(0).show()</span>
<span class="sd"> +---+----+</span>
<span class="sd"> |age|name|</span>
<span class="sd"> +---+----+</span>
<span class="sd"> +---+----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.offset"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.offset.html#pyspark.sql.DataFrame.offset">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">offset</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class: `DataFrame` by skipping the first `n` rows.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> .. versionchanged:: 3.5.0</span>
<span class="sd"> Supports vanilla PySpark.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> num : int</span>
<span class="sd"> Number of records to skip.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Subset of the records</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.offset(1).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 23|Alice|</span>
<span class="sd"> | 16| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &gt;&gt;&gt; df.offset(10).show()</span>
<span class="sd"> +---+----+</span>
<span class="sd"> |age|name|</span>
<span class="sd"> +---+----+</span>
<span class="sd"> +---+----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.take"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.take.html#pyspark.sql.DataFrame.take">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">take</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Row</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the first ``num`` rows as a :class:`list` of :class:`Row`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> num : int</span>
<span class="sd"> Number of records to return. Will return this number of records</span>
<span class="sd"> or all records if the DataFrame contains less than this number of records..</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> list</span>
<span class="sd"> List of rows</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> Return the first 2 rows of the :class:`DataFrame`.</span>
<span class="sd"> &gt;&gt;&gt; df.take(2)</span>
<span class="sd"> [Row(age=14, name=&#39;Tom&#39;), Row(age=23, name=&#39;Alice&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.tail"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.tail.html#pyspark.sql.DataFrame.tail">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">tail</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Row</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the last ``num`` rows as a :class:`list` of :class:`Row`.</span>
<span class="sd"> Running tail requires moving data into the application&#39;s driver process, and doing so with</span>
<span class="sd"> a very large ``num`` can crash the driver process with OutOfMemoryError.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> num : int</span>
<span class="sd"> Number of records to return. Will return this number of records</span>
<span class="sd"> or all records if the DataFrame contains less than this number of records.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> list</span>
<span class="sd"> List of rows</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.tail(2)</span>
<span class="sd"> [Row(age=23, name=&#39;Alice&#39;), Row(age=16, name=&#39;Bob&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.foreach"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.foreach.html#pyspark.sql.DataFrame.foreach">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">foreach</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Row</span><span class="p">],</span> <span class="kc">None</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Applies the ``f`` function to all :class:`Row` of this :class:`DataFrame`.</span>
<span class="sd"> This is a shorthand for ``df.rdd.foreach()``.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> f : function</span>
<span class="sd"> A function that accepts one parameter which will</span>
<span class="sd"> receive each row to process.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; def func(person):</span>
<span class="sd"> ... print(person.name)</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df.foreach(func)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.foreachPartition"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.foreachPartition.html#pyspark.sql.DataFrame.foreachPartition">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">foreachPartition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Iterator</span><span class="p">[</span><span class="n">Row</span><span class="p">]],</span> <span class="kc">None</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Applies the ``f`` function to each partition of this :class:`DataFrame`.</span>
<span class="sd"> This a shorthand for ``df.rdd.foreachPartition()``.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> f : function</span>
<span class="sd"> A function that accepts one parameter which will receive</span>
<span class="sd"> each partition to process.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; def func(itr):</span>
<span class="sd"> ... for person in itr:</span>
<span class="sd"> ... print(person.name)</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df.foreachPartition(func)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.cache"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.cache.html#pyspark.sql.DataFrame.cache">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">cache</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Persists the :class:`DataFrame` with the default storage level (`MEMORY_AND_DISK_DESER`).</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The default storage level has changed to `MEMORY_AND_DISK_DESER` to match Scala in 3.0.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Cached DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.cache()</span>
<span class="sd"> DataFrame[id: bigint]</span>
<span class="sd"> &gt;&gt;&gt; df.explain()</span>
<span class="sd"> == Physical Plan ==</span>
<span class="sd"> InMemoryTableScan ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.persist"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.persist.html#pyspark.sql.DataFrame.persist">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">persist</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">storageLevel</span><span class="p">:</span> <span class="n">StorageLevel</span> <span class="o">=</span> <span class="p">(</span><span class="n">StorageLevel</span><span class="o">.</span><span class="n">MEMORY_AND_DISK_DESER</span><span class="p">),</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Sets the storage level to persist the contents of the :class:`DataFrame` across</span>
<span class="sd"> operations after the first time it is computed. This can only be used to assign</span>
<span class="sd"> a new storage level if the :class:`DataFrame` does not have a storage level set yet.</span>
<span class="sd"> If no storage level is specified defaults to (`MEMORY_AND_DISK_DESER`)</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The default storage level has changed to `MEMORY_AND_DISK_DESER` to match Scala in 3.0.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> storageLevel : :class:`StorageLevel`</span>
<span class="sd"> Storage level to set for persistence. Default is MEMORY_AND_DISK_DESER.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Persisted DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.persist()</span>
<span class="sd"> DataFrame[id: bigint]</span>
<span class="sd"> &gt;&gt;&gt; df.explain()</span>
<span class="sd"> == Physical Plan ==</span>
<span class="sd"> InMemoryTableScan ...</span>
<span class="sd"> Persists the data in the disk by specifying the storage level.</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.storagelevel import StorageLevel</span>
<span class="sd"> &gt;&gt;&gt; df.persist(StorageLevel.DISK_ONLY)</span>
<span class="sd"> DataFrame[id: bigint]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">storageLevel</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">StorageLevel</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Get the :class:`DataFrame`&#39;s current storage level.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`StorageLevel`</span>
<span class="sd"> Currently defined storage level.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.range(10)</span>
<span class="sd"> &gt;&gt;&gt; df1.storageLevel</span>
<span class="sd"> StorageLevel(False, False, False, False, 1)</span>
<span class="sd"> &gt;&gt;&gt; df1.cache().storageLevel</span>
<span class="sd"> StorageLevel(True, True, False, True, 1)</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.range(5)</span>
<span class="sd"> &gt;&gt;&gt; df2.persist(StorageLevel.DISK_ONLY_2).storageLevel</span>
<span class="sd"> StorageLevel(True, False, False, False, 2)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.unpersist"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.unpersist.html#pyspark.sql.DataFrame.unpersist">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">unpersist</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">blocking</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from</span>
<span class="sd"> memory and disk.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> `blocking` default has changed to ``False`` to match Scala in 2.0.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> blocking : bool</span>
<span class="sd"> Whether to block until all blocks are deleted.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Unpersisted DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.persist()</span>
<span class="sd"> DataFrame[id: bigint]</span>
<span class="sd"> &gt;&gt;&gt; df.unpersist()</span>
<span class="sd"> DataFrame[id: bigint]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(1)</span>
<span class="sd"> &gt;&gt;&gt; df.unpersist(True)</span>
<span class="sd"> DataFrame[id: bigint]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">is_cached</span> <span class="o">=</span> <span class="kc">False</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">unpersist</span><span class="p">(</span><span class="n">blocking</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span></div>
<div class="viewcode-block" id="DataFrame.coalesce"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.coalesce.html#pyspark.sql.DataFrame.coalesce">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">coalesce</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new :class:`DataFrame` that has exactly `numPartitions` partitions.</span>
<span class="sd"> Similar to coalesce defined on an :class:`RDD`, this operation results in a</span>
<span class="sd"> narrow dependency, e.g. if you go from 1000 partitions to 100 partitions,</span>
<span class="sd"> there will not be a shuffle, instead each of the 100 new partitions will</span>
<span class="sd"> claim 10 of the current partitions. If a larger number of partitions is requested,</span>
<span class="sd"> it will stay at the current number of partitions.</span>
<span class="sd"> However, if you&#39;re doing a drastic coalesce, e.g. to numPartitions = 1,</span>
<span class="sd"> this may result in your computation taking place on fewer nodes than</span>
<span class="sd"> you like (e.g. one node in the case of numPartitions = 1). To avoid this,</span>
<span class="sd"> you can call repartition(). This will add a shuffle step, but means the</span>
<span class="sd"> current upstream partitions will be executed in parallel (per whatever</span>
<span class="sd"> the current partitioning is).</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> numPartitions : int</span>
<span class="sd"> specify the target number of partitions</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(0, 10, 1, 3).select(</span>
<span class="sd"> ... sf.spark_partition_id().alias(&quot;partition&quot;)</span>
<span class="sd"> ... ).distinct().sort(&quot;partition&quot;).show()</span>
<span class="sd"> +---------+</span>
<span class="sd"> |partition|</span>
<span class="sd"> +---------+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> +---------+</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.range(0, 10, 1, 3).coalesce(1).select(</span>
<span class="sd"> ... sf.spark_partition_id().alias(&quot;partition&quot;)</span>
<span class="sd"> ... ).distinct().sort(&quot;partition&quot;).show()</span>
<span class="sd"> +---------+</span>
<span class="sd"> |partition|</span>
<span class="sd"> +---------+</span>
<span class="sd"> | 0|</span>
<span class="sd"> +---------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sparkSession</span><span class="p">)</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">repartition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">repartition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.repartition"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.repartition.html#pyspark.sql.DataFrame.repartition">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span>
<span class="k">def</span> <span class="nf">repartition</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">],</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The</span>
<span class="sd"> resulting :class:`DataFrame` is hash partitioned.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> numPartitions : int</span>
<span class="sd"> can be an int to specify the target number of partitions or a Column.</span>
<span class="sd"> If it is a Column, it will be used as the first partitioning column. If not specified,</span>
<span class="sd"> the default number of partitions is used.</span>
<span class="sd"> cols : str or :class:`Column`</span>
<span class="sd"> partitioning columns.</span>
<span class="sd"> .. versionchanged:: 1.6.0</span>
<span class="sd"> Added optional arguments to specify the partitioning columns. Also made numPartitions</span>
<span class="sd"> optional if partitioning columns are specified.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Repartitioned DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(0, 64, 1, 9).withColumn(</span>
<span class="sd"> ... &quot;name&quot;, sf.concat(sf.lit(&quot;name_&quot;), sf.col(&quot;id&quot;).cast(&quot;string&quot;))</span>
<span class="sd"> ... ).withColumn(</span>
<span class="sd"> ... &quot;age&quot;, sf.col(&quot;id&quot;) - 32</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(</span>
<span class="sd"> ... sf.spark_partition_id().alias(&quot;partition&quot;)</span>
<span class="sd"> ... ).distinct().sort(&quot;partition&quot;).show()</span>
<span class="sd"> +---------+</span>
<span class="sd"> |partition|</span>
<span class="sd"> +---------+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 3|</span>
<span class="sd"> | 4|</span>
<span class="sd"> | 5|</span>
<span class="sd"> | 6|</span>
<span class="sd"> | 7|</span>
<span class="sd"> | 8|</span>
<span class="sd"> +---------+</span>
<span class="sd"> Repartition the data into 10 partitions.</span>
<span class="sd"> &gt;&gt;&gt; df.repartition(10).select(</span>
<span class="sd"> ... sf.spark_partition_id().alias(&quot;partition&quot;)</span>
<span class="sd"> ... ).distinct().sort(&quot;partition&quot;).show()</span>
<span class="sd"> +---------+</span>
<span class="sd"> |partition|</span>
<span class="sd"> +---------+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 3|</span>
<span class="sd"> | 4|</span>
<span class="sd"> | 5|</span>
<span class="sd"> | 6|</span>
<span class="sd"> | 7|</span>
<span class="sd"> | 8|</span>
<span class="sd"> | 9|</span>
<span class="sd"> +---------+</span>
<span class="sd"> Repartition the data into 7 partitions by &#39;age&#39; column.</span>
<span class="sd"> &gt;&gt;&gt; df.repartition(7, &quot;age&quot;).select(</span>
<span class="sd"> ... sf.spark_partition_id().alias(&quot;partition&quot;)</span>
<span class="sd"> ... ).distinct().sort(&quot;partition&quot;).show()</span>
<span class="sd"> +---------+</span>
<span class="sd"> |partition|</span>
<span class="sd"> +---------+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 3|</span>
<span class="sd"> | 4|</span>
<span class="sd"> | 5|</span>
<span class="sd"> | 6|</span>
<span class="sd"> +---------+</span>
<span class="sd"> Repartition the data into 3 partitions by &#39;age&#39; and &#39;name&#39; columns.</span>
<span class="sd"> &gt;&gt;&gt; df.repartition(3, &quot;name&quot;, &quot;age&quot;).select(</span>
<span class="sd"> ... sf.spark_partition_id().alias(&quot;partition&quot;)</span>
<span class="sd"> ... ).distinct().sort(&quot;partition&quot;).show()</span>
<span class="sd"> +---------+</span>
<span class="sd"> |partition|</span>
<span class="sd"> +---------+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> +---------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">repartitionByRange</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">repartitionByRange</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.repartitionByRange"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.repartitionByRange.html#pyspark.sql.DataFrame.repartitionByRange">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span>
<span class="k">def</span> <span class="nf">repartitionByRange</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">],</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The</span>
<span class="sd"> resulting :class:`DataFrame` is range partitioned.</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> numPartitions : int</span>
<span class="sd"> can be an int to specify the target number of partitions or a Column.</span>
<span class="sd"> If it is a Column, it will be used as the first partitioning column. If not specified,</span>
<span class="sd"> the default number of partitions is used.</span>
<span class="sd"> cols : str or :class:`Column`</span>
<span class="sd"> partitioning columns.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Repartitioned DataFrame.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> At least one partition-by expression must be specified.</span>
<span class="sd"> When no explicit sort order is specified, &quot;ascending nulls first&quot; is assumed.</span>
<span class="sd"> Due to performance reasons this method uses sampling to estimate the ranges.</span>
<span class="sd"> Hence, the output may not be consistent, since sampling can return different values.</span>
<span class="sd"> The sample size can be controlled by the config</span>
<span class="sd"> `spark.sql.execution.rangeExchange.sampleSizePerPartition`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Repartition the data into 2 partitions by range in &#39;age&#39; column.</span>
<span class="sd"> For example, the first partition can have ``(14, &quot;Tom&quot;)`` and ``(16, &quot;Bob&quot;)``,</span>
<span class="sd"> and the second partition would have ``(23, &quot;Alice&quot;)``.</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;]</span>
<span class="sd"> ... ).repartitionByRange(2, &quot;age&quot;).select(</span>
<span class="sd"> ... &quot;age&quot;, &quot;name&quot;, sf.spark_partition_id()</span>
<span class="sd"> ... ).show()</span>
<span class="sd"> +---+-----+--------------------+</span>
<span class="sd"> |age| name|SPARK_PARTITION_ID()|</span>
<span class="sd"> +---+-----+--------------------+</span>
<span class="sd"> | 14| Tom| 0|</span>
<span class="sd"> | 16| Bob| 0|</span>
<span class="sd"> | 23|Alice| 1|</span>
<span class="sd"> +---+-----+--------------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.distinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.distinct.html#pyspark.sql.DataFrame.distinct">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">distinct</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> DataFrame with distinct records.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.dropDuplicates</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Remove duplicate rows from a DataFrame</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (23, &quot;Alice&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.distinct().show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 14| Tom|</span>
<span class="sd"> | 23|Alice|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Count the number of distinct rows in a DataFrame</span>
<span class="sd"> &gt;&gt;&gt; df.distinct().count()</span>
<span class="sd"> 2</span>
<span class="sd"> Get distinct rows from a DataFrame with multiple columns</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;, &quot;M&quot;), (23, &quot;Alice&quot;, &quot;F&quot;), (23, &quot;Alice&quot;, &quot;F&quot;), (14, &quot;Tom&quot;, &quot;M&quot;)],</span>
<span class="sd"> ... [&quot;age&quot;, &quot;name&quot;, &quot;gender&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.distinct().show()</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> |age| name|gender|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> | 14| Tom| M|</span>
<span class="sd"> | 23|Alice| F|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> Get distinct values from a specific column in a DataFrame</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;name&quot;).distinct().show()</span>
<span class="sd"> +-----+</span>
<span class="sd"> | name|</span>
<span class="sd"> +-----+</span>
<span class="sd"> | Tom|</span>
<span class="sd"> |Alice|</span>
<span class="sd"> +-----+</span>
<span class="sd"> Count the number of distinct values in a specific column</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;name&quot;).distinct().count()</span>
<span class="sd"> 2</span>
<span class="sd"> Get distinct values from multiple columns in DataFrame</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;name&quot;, &quot;gender&quot;).distinct().show()</span>
<span class="sd"> +-----+------+</span>
<span class="sd"> | name|gender|</span>
<span class="sd"> +-----+------+</span>
<span class="sd"> | Tom| M|</span>
<span class="sd"> |Alice| F|</span>
<span class="sd"> +-----+------+</span>
<span class="sd"> Get distinct rows from a DataFrame with null values</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;, &quot;M&quot;), (23, &quot;Alice&quot;, &quot;F&quot;), (23, &quot;Alice&quot;, &quot;F&quot;), (14, &quot;Tom&quot;, None)],</span>
<span class="sd"> ... [&quot;age&quot;, &quot;name&quot;, &quot;gender&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.distinct().show()</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> |age| name|gender|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> | 14| Tom| M|</span>
<span class="sd"> | 23|Alice| F|</span>
<span class="sd"> | 14| Tom| NULL|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> Get distinct non-null values from a DataFrame</span>
<span class="sd"> &gt;&gt;&gt; df.distinct().filter(df.gender.isNotNull()).show()</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> |age| name|gender|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> | 14| Tom| M|</span>
<span class="sd"> | 23|Alice| F|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">sample</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">fraction</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">sample</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">withReplacement</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">],</span>
<span class="n">fraction</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.sample"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.sample.html#pyspark.sql.DataFrame.sample">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span>
<span class="k">def</span> <span class="nf">sample</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">withReplacement</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">float</span><span class="p">,</span> <span class="nb">bool</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">fraction</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a sampled subset of this :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> withReplacement : bool, optional</span>
<span class="sd"> Sample with replacement or not (default ``False``).</span>
<span class="sd"> fraction : float, optional</span>
<span class="sd"> Fraction of rows to generate, range [0.0, 1.0].</span>
<span class="sd"> seed : int, optional</span>
<span class="sd"> Seed for sampling (default a random seed).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Sampled rows from given DataFrame.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This is not guaranteed to provide exactly the fraction specified of the total</span>
<span class="sd"> count of the given :class:`DataFrame`.</span>
<span class="sd"> `fraction` is required and, `withReplacement` and `seed` are optional.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(10)</span>
<span class="sd"> &gt;&gt;&gt; df.sample(0.5, 3).count() # doctest: +SKIP</span>
<span class="sd"> 7</span>
<span class="sd"> &gt;&gt;&gt; df.sample(fraction=0.5, seed=3).count() # doctest: +SKIP</span>
<span class="sd"> 7</span>
<span class="sd"> &gt;&gt;&gt; df.sample(withReplacement=True, fraction=0.5, seed=3).count() # doctest: +SKIP</span>
<span class="sd"> 1</span>
<span class="sd"> &gt;&gt;&gt; df.sample(1.0).count()</span>
<span class="sd"> 10</span>
<span class="sd"> &gt;&gt;&gt; df.sample(fraction=1.0).count()</span>
<span class="sd"> 10</span>
<span class="sd"> &gt;&gt;&gt; df.sample(False, fraction=1.0).count()</span>
<span class="sd"> 10</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.sampleBy"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.sampleBy.html#pyspark.sql.DataFrame.sampleBy">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">sampleBy</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">col</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">fractions</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="nb">float</span><span class="p">],</span> <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a stratified sample without replacement based on the</span>
<span class="sd"> fraction given on each stratum.</span>
<span class="sd"> .. versionadded:: 1.5.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col : :class:`Column` or str</span>
<span class="sd"> column that defines strata</span>
<span class="sd"> .. versionchanged:: 3.0.0</span>
<span class="sd"> Added sampling by a column of :class:`Column`</span>
<span class="sd"> fractions : dict</span>
<span class="sd"> sampling fraction for each stratum. If a stratum is not</span>
<span class="sd"> specified, we treat its fraction as zero.</span>
<span class="sd"> seed : int, optional</span>
<span class="sd"> random seed</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> a new :class:`DataFrame` that represents the stratified sample</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import col</span>
<span class="sd"> &gt;&gt;&gt; dataset = spark.range(0, 100).select((col(&quot;id&quot;) % 3).alias(&quot;key&quot;))</span>
<span class="sd"> &gt;&gt;&gt; sampled = dataset.sampleBy(&quot;key&quot;, fractions={0: 0.1, 1: 0.2}, seed=0)</span>
<span class="sd"> &gt;&gt;&gt; sampled.groupBy(&quot;key&quot;).count().orderBy(&quot;key&quot;).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |key|count|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 0| 3|</span>
<span class="sd"> | 1| 6|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &gt;&gt;&gt; dataset.sampleBy(col(&quot;key&quot;), fractions={2: 1.0}, seed=0).count()</span>
<span class="sd"> 33</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.randomSplit"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.randomSplit.html#pyspark.sql.DataFrame.randomSplit">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">randomSplit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">weights</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;DataFrame&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Randomly splits this :class:`DataFrame` with the provided weights.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> weights : list</span>
<span class="sd"> list of doubles as weights with which to split the :class:`DataFrame`.</span>
<span class="sd"> Weights will be normalized if they don&#39;t sum up to 1.0.</span>
<span class="sd"> seed : int, optional</span>
<span class="sd"> The seed for sampling.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> list</span>
<span class="sd"> List of DataFrames.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... Row(age=10, height=80, name=&quot;Alice&quot;),</span>
<span class="sd"> ... Row(age=5, height=None, name=&quot;Bob&quot;),</span>
<span class="sd"> ... Row(age=None, height=None, name=&quot;Tom&quot;),</span>
<span class="sd"> ... Row(age=None, height=None, name=None),</span>
<span class="sd"> ... ])</span>
<span class="sd"> &gt;&gt;&gt; splits = df.randomSplit([1.0, 2.0], 24)</span>
<span class="sd"> &gt;&gt;&gt; splits[0].count()</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; splits[1].count()</span>
<span class="sd"> 2</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">dtypes</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns all column names and their data types as a list.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> list</span>
<span class="sd"> List of columns as tuple pairs.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.dtypes</span>
<span class="sd"> [(&#39;age&#39;, &#39;bigint&#39;), (&#39;name&#39;, &#39;string&#39;)]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">columns</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Retrieves the names of all columns in the :class:`DataFrame` as a list.</span>
<span class="sd"> The order of the column names in the list reflects their order in the DataFrame.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> list</span>
<span class="sd"> List of column names in the DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Retrieve column names of a DataFrame</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;, &quot;CA&quot;), (23, &quot;Alice&quot;, &quot;NY&quot;), (16, &quot;Bob&quot;, &quot;TX&quot;)],</span>
<span class="sd"> ... [&quot;age&quot;, &quot;name&quot;, &quot;state&quot;]</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.columns</span>
<span class="sd"> [&#39;age&#39;, &#39;name&#39;, &#39;state&#39;]</span>
<span class="sd"> Example 2: Using column names to project specific columns</span>
<span class="sd"> &gt;&gt;&gt; selected_cols = [col for col in df.columns if col != &quot;age&quot;]</span>
<span class="sd"> &gt;&gt;&gt; df.select(selected_cols).show()</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> | name|state|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> | Tom| CA|</span>
<span class="sd"> |Alice| NY|</span>
<span class="sd"> | Bob| TX|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> Example 3: Checking if a specific column exists in a DataFrame</span>
<span class="sd"> &gt;&gt;&gt; &quot;state&quot; in df.columns</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; &quot;salary&quot; in df.columns</span>
<span class="sd"> False</span>
<span class="sd"> Example 4: Iterating over columns to apply a transformation</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as f</span>
<span class="sd"> &gt;&gt;&gt; for col_name in df.columns:</span>
<span class="sd"> ... df = df.withColumn(col_name, f.upper(f.col(col_name)))</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +---+-----+-----+</span>
<span class="sd"> |age| name|state|</span>
<span class="sd"> +---+-----+-----+</span>
<span class="sd"> | 14| TOM| CA|</span>
<span class="sd"> | 23|ALICE| NY|</span>
<span class="sd"> | 16| BOB| TX|</span>
<span class="sd"> +---+-----+-----+</span>
<span class="sd"> Example 5: Renaming columns and checking the updated column names</span>
<span class="sd"> &gt;&gt;&gt; df = df.withColumnRenamed(&quot;name&quot;, &quot;first_name&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.columns</span>
<span class="sd"> [&#39;age&#39;, &#39;first_name&#39;, &#39;state&#39;]</span>
<span class="sd"> Example 6: Using the `columns` property to ensure two DataFrames have the</span>
<span class="sd"> same columns before a union</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame(</span>
<span class="sd"> ... [(30, &quot;Eve&quot;, &quot;FL&quot;), (40, &quot;Sam&quot;, &quot;WA&quot;)], [&quot;age&quot;, &quot;name&quot;, &quot;location&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.columns == df2.columns</span>
<span class="sd"> False</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.colRegex"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.colRegex.html#pyspark.sql.DataFrame.colRegex">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">colRegex</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">colName</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Selects column based on the column name specified as a regex and returns it</span>
<span class="sd"> as :class:`Column`.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> colName : str</span>
<span class="sd"> string, column name specified as a regex.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`Column`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;a&quot;, 1), (&quot;b&quot;, 2), (&quot;c&quot;, 3)], [&quot;Col1&quot;, &quot;Col2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(df.colRegex(&quot;`(Col1)?+.+`&quot;)).show()</span>
<span class="sd"> +----+</span>
<span class="sd"> |Col2|</span>
<span class="sd"> +----+</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 3|</span>
<span class="sd"> +----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.to"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.to.html#pyspark.sql.DataFrame.to">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">to</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new :class:`DataFrame` where each row is reconciled to match the specified</span>
<span class="sd"> schema.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> schema : :class:`StructType`</span>
<span class="sd"> Specified schema.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Reconciled DataFrame.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> * Reorder columns and/or inner fields by name to match the specified schema.</span>
<span class="sd"> * Project away columns and/or inner fields that are not needed by the specified schema.</span>
<span class="sd"> Missing columns and/or inner fields (present in the specified schema but not input</span>
<span class="sd"> DataFrame) lead to failures.</span>
<span class="sd"> * Cast the columns and/or inner fields to match the data types in the specified schema,</span>
<span class="sd"> if the types are compatible, e.g., numeric to numeric (error if overflows), but</span>
<span class="sd"> not string to int.</span>
<span class="sd"> * Carry over the metadata from the specified schema, while the columns and/or inner fields</span>
<span class="sd"> still keep their own metadata if not overwritten by the specified schema.</span>
<span class="sd"> * Fail if the nullability is not compatible. For example, the column and/or inner field</span>
<span class="sd"> is nullable but the specified schema requires them to be not nullable.</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import StructField, StringType</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;a&quot;, 1)], [&quot;i&quot;, &quot;j&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.schema</span>
<span class="sd"> StructType([StructField(&#39;i&#39;, StringType(), True), StructField(&#39;j&#39;, LongType(), True)])</span>
<span class="sd"> &gt;&gt;&gt; schema = StructType([StructField(&quot;j&quot;, StringType()), StructField(&quot;i&quot;, StringType())])</span>
<span class="sd"> &gt;&gt;&gt; df2 = df.to(schema)</span>
<span class="sd"> &gt;&gt;&gt; df2.schema</span>
<span class="sd"> StructType([StructField(&#39;j&#39;, StringType(), True), StructField(&#39;i&#39;, StringType(), True)])</span>
<span class="sd"> &gt;&gt;&gt; df2.show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | j| i|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| a|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.alias"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.alias.html#pyspark.sql.DataFrame.alias">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">alias</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">alias</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`DataFrame` with an alias set.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> alias : str</span>
<span class="sd"> an alias name to be set for the :class:`DataFrame`.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Aliased DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import col, desc</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df_as1 = df.alias(&quot;df_as1&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df_as2 = df.alias(&quot;df_as2&quot;)</span>
<span class="sd"> &gt;&gt;&gt; joined_df = df_as1.join(df_as2, col(&quot;df_as1.name&quot;) == col(&quot;df_as2.name&quot;), &#39;inner&#39;)</span>
<span class="sd"> &gt;&gt;&gt; joined_df.select(</span>
<span class="sd"> ... &quot;df_as1.name&quot;, &quot;df_as2.name&quot;, &quot;df_as2.age&quot;).sort(desc(&quot;df_as1.name&quot;)).show()</span>
<span class="sd"> +-----+-----+---+</span>
<span class="sd"> | name| name|age|</span>
<span class="sd"> +-----+-----+---+</span>
<span class="sd"> | Tom| Tom| 14|</span>
<span class="sd"> | Bob| Bob| 16|</span>
<span class="sd"> |Alice|Alice| 23|</span>
<span class="sd"> +-----+-----+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.crossJoin"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.crossJoin.html#pyspark.sql.DataFrame.crossJoin">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">crossJoin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the cartesian product with another :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> other : :class:`DataFrame`</span>
<span class="sd"> Right side of the cartesian product.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Joined DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame(</span>
<span class="sd"> ... [Row(height=80, name=&quot;Tom&quot;), Row(height=85, name=&quot;Bob&quot;)])</span>
<span class="sd"> &gt;&gt;&gt; df.crossJoin(df2.select(&quot;height&quot;)).select(&quot;age&quot;, &quot;name&quot;, &quot;height&quot;).show()</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> |age| name|height|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> | 14| Tom| 80|</span>
<span class="sd"> | 14| Tom| 85|</span>
<span class="sd"> | 23|Alice| 80|</span>
<span class="sd"> | 23|Alice| 85|</span>
<span class="sd"> | 16| Bob| 80|</span>
<span class="sd"> | 16| Bob| 85|</span>
<span class="sd"> +---+-----+------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.join"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.join.html#pyspark.sql.DataFrame.join">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">join</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">other</span><span class="p">:</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">,</span>
<span class="n">on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Column</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">how</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Joins with another :class:`DataFrame`, using the given join expression.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> other : :class:`DataFrame`</span>
<span class="sd"> Right side of the join</span>
<span class="sd"> on : str, list or :class:`Column`, optional</span>
<span class="sd"> a string for the join column name, a list of column names,</span>
<span class="sd"> a join expression (Column), or a list of Columns.</span>
<span class="sd"> If `on` is a string or a list of strings indicating the name of the join column(s),</span>
<span class="sd"> the column(s) must exist on both sides, and this performs an equi-join.</span>
<span class="sd"> how : str, optional</span>
<span class="sd"> default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``,</span>
<span class="sd"> ``full``, ``fullouter``, ``full_outer``, ``left``, ``leftouter``, ``left_outer``,</span>
<span class="sd"> ``right``, ``rightouter``, ``right_outer``, ``semi``, ``leftsemi``, ``left_semi``,</span>
<span class="sd"> ``anti``, ``leftanti`` and ``left_anti``.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Joined DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> The following examples demonstrate various join types among ``df1``, ``df2``, and ``df3``.</span>
<span class="sd"> &gt;&gt;&gt; import pyspark.sql.functions as sf</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([Row(name=&quot;Alice&quot;, age=2), Row(name=&quot;Bob&quot;, age=5)])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([Row(name=&quot;Tom&quot;, height=80), Row(name=&quot;Bob&quot;, height=85)])</span>
<span class="sd"> &gt;&gt;&gt; df3 = spark.createDataFrame([</span>
<span class="sd"> ... Row(name=&quot;Alice&quot;, age=10, height=80),</span>
<span class="sd"> ... Row(name=&quot;Bob&quot;, age=5, height=None),</span>
<span class="sd"> ... Row(name=&quot;Tom&quot;, age=None, height=None),</span>
<span class="sd"> ... Row(name=None, age=None, height=None),</span>
<span class="sd"> ... ])</span>
<span class="sd"> Inner join on columns (default)</span>
<span class="sd"> &gt;&gt;&gt; df.join(df2, &quot;name&quot;).show()</span>
<span class="sd"> +----+---+------+</span>
<span class="sd"> |name|age|height|</span>
<span class="sd"> +----+---+------+</span>
<span class="sd"> | Bob| 5| 85|</span>
<span class="sd"> +----+---+------+</span>
<span class="sd"> &gt;&gt;&gt; df.join(df3, [&quot;name&quot;, &quot;age&quot;]).show()</span>
<span class="sd"> +----+---+------+</span>
<span class="sd"> |name|age|height|</span>
<span class="sd"> +----+---+------+</span>
<span class="sd"> | Bob| 5| NULL|</span>
<span class="sd"> +----+---+------+</span>
<span class="sd"> Outer join on a single column with an explicit join condition.</span>
<span class="sd"> When the join condition is explicited stated: `df.name == df2.name`, this will</span>
<span class="sd"> produce all records where the names match, as well as those that don&#39;t (since</span>
<span class="sd"> it&#39;s an outer join). If there are names in `df2` that are not present in `df`,</span>
<span class="sd"> they will appear with `NULL` in the `name` column of `df`, and vice versa for `df2`.</span>
<span class="sd"> &gt;&gt;&gt; joined = df.join(df2, df.name == df2.name, &quot;outer&quot;).sort(sf.desc(df.name))</span>
<span class="sd"> &gt;&gt;&gt; joined.show() # doctest: +SKIP</span>
<span class="sd"> +-----+----+----+------+</span>
<span class="sd"> | name| age|name|height|</span>
<span class="sd"> +-----+----+----+------+</span>
<span class="sd"> | Bob| 5| Bob| 85|</span>
<span class="sd"> |Alice| 2|NULL| NULL|</span>
<span class="sd"> | NULL|NULL| Tom| 80|</span>
<span class="sd"> +-----+----+----+------+</span>
<span class="sd"> To unambiguously select output columns, specify the dataframe along with the column name:</span>
<span class="sd"> &gt;&gt;&gt; joined.select(df.name, df2.height).show() # doctest: +SKIP</span>
<span class="sd"> +-----+------+</span>
<span class="sd"> | name|height|</span>
<span class="sd"> +-----+------+</span>
<span class="sd"> | Bob| 85|</span>
<span class="sd"> |Alice| NULL|</span>
<span class="sd"> | NULL| 80|</span>
<span class="sd"> +-----+------+</span>
<span class="sd"> However, in self-joins, direct column references can cause ambiguity:</span>
<span class="sd"> &gt;&gt;&gt; df.join(df, df.name == df.name, &quot;outer&quot;).select(df.name).show() # doctest: +SKIP</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> pyspark.errors.exceptions.captured.AnalysisException: Column name#0 are ambiguous...</span>
<span class="sd"> A better approach is to assign aliases to the dataframes, and then reference</span>
<span class="sd"> the ouptut columns from the join operation using these aliases:</span>
<span class="sd"> &gt;&gt;&gt; df.alias(&quot;a&quot;).join(</span>
<span class="sd"> ... df.alias(&quot;b&quot;), sf.col(&quot;a.name&quot;) == sf.col(&quot;b.name&quot;), &quot;outer&quot;</span>
<span class="sd"> ... ).sort(sf.desc(&quot;a.name&quot;)).select(&quot;a.name&quot;, &quot;b.age&quot;).show()</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | name|age|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | Bob| 5|</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> Outer join on a single column with implicit join condition using column name</span>
<span class="sd"> When you provide the column name directly as the join condition, Spark will treat</span>
<span class="sd"> both name columns as one, and will not produce separate columns for `df.name` and</span>
<span class="sd"> `df2.name`. This avoids having duplicate columns in the output.</span>
<span class="sd"> &gt;&gt;&gt; df.join(df2, &quot;name&quot;, &quot;outer&quot;).sort(sf.desc(&quot;name&quot;)).show()</span>
<span class="sd"> +-----+----+------+</span>
<span class="sd"> | name| age|height|</span>
<span class="sd"> +-----+----+------+</span>
<span class="sd"> | Tom|NULL| 80|</span>
<span class="sd"> | Bob| 5| 85|</span>
<span class="sd"> |Alice| 2| NULL|</span>
<span class="sd"> +-----+----+------+</span>
<span class="sd"> Outer join on multiple columns</span>
<span class="sd"> &gt;&gt;&gt; df.join(df3, [&quot;name&quot;, &quot;age&quot;], &quot;outer&quot;).sort(&quot;name&quot;, &quot;age&quot;).show()</span>
<span class="sd"> +-----+----+------+</span>
<span class="sd"> | name| age|height|</span>
<span class="sd"> +-----+----+------+</span>
<span class="sd"> | NULL|NULL| NULL|</span>
<span class="sd"> |Alice| 2| NULL|</span>
<span class="sd"> |Alice| 10| 80|</span>
<span class="sd"> | Bob| 5| NULL|</span>
<span class="sd"> | Tom|NULL| NULL|</span>
<span class="sd"> +-----+----+------+</span>
<span class="sd"> Left outer join on columns</span>
<span class="sd"> &gt;&gt;&gt; df.join(df2, &quot;name&quot;, &quot;left_outer&quot;).show()</span>
<span class="sd"> +-----+---+------+</span>
<span class="sd"> | name|age|height|</span>
<span class="sd"> +-----+---+------+</span>
<span class="sd"> |Alice| 2| NULL|</span>
<span class="sd"> | Bob| 5| 85|</span>
<span class="sd"> +-----+---+------+</span>
<span class="sd"> Right outer join on columns</span>
<span class="sd"> &gt;&gt;&gt; df.join(df2, &quot;name&quot;, &quot;right_outer&quot;).show()</span>
<span class="sd"> +----+----+------+</span>
<span class="sd"> |name| age|height|</span>
<span class="sd"> +----+----+------+</span>
<span class="sd"> | Tom|NULL| 80|</span>
<span class="sd"> | Bob| 5| 85|</span>
<span class="sd"> +----+----+------+</span>
<span class="sd"> Left semi join on columns</span>
<span class="sd"> &gt;&gt;&gt; df.join(df2, &quot;name&quot;, &quot;left_semi&quot;).show()</span>
<span class="sd"> +----+---+</span>
<span class="sd"> |name|age|</span>
<span class="sd"> +----+---+</span>
<span class="sd"> | Bob| 5|</span>
<span class="sd"> +----+---+</span>
<span class="sd"> Left anti join on columns</span>
<span class="sd"> &gt;&gt;&gt; df.join(df2, &quot;name&quot;, &quot;left_anti&quot;).show()</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | name|age|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="c1"># TODO(SPARK-22947): Fix the DataFrame API.</span>
<span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">_joinAsOf</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">other</span><span class="p">:</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">,</span>
<span class="n">leftAsOfColumn</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span>
<span class="n">rightAsOfColumn</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span>
<span class="n">on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Column</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">how</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">tolerance</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Column</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">allowExactMatches</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="n">direction</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;backward&quot;</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Perform an as-of join.</span>
<span class="sd"> This is similar to a left-join except that we match on the nearest</span>
<span class="sd"> key rather than equal keys.</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> other : :class:`DataFrame`</span>
<span class="sd"> Right side of the join</span>
<span class="sd"> leftAsOfColumn : str or :class:`Column`</span>
<span class="sd"> a string for the as-of join column name, or a Column</span>
<span class="sd"> rightAsOfColumn : str or :class:`Column`</span>
<span class="sd"> a string for the as-of join column name, or a Column</span>
<span class="sd"> on : str, list or :class:`Column`, optional</span>
<span class="sd"> a string for the join column name, a list of column names,</span>
<span class="sd"> a join expression (Column), or a list of Columns.</span>
<span class="sd"> If `on` is a string or a list of strings indicating the name of the join column(s),</span>
<span class="sd"> the column(s) must exist on both sides, and this performs an equi-join.</span>
<span class="sd"> how : str, optional</span>
<span class="sd"> default ``inner``. Must be one of: ``inner`` and ``left``.</span>
<span class="sd"> tolerance : :class:`Column`, optional</span>
<span class="sd"> an as-of tolerance within this range; must be compatible</span>
<span class="sd"> with the type of the as-of columns.</span>
<span class="sd"> allowExactMatches : bool, optional</span>
<span class="sd"> default ``True``.</span>
<span class="sd"> direction : str, optional</span>
<span class="sd"> default ``backward``. Must be one of: ``backward``, ``forward``, and ``nearest``.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> The following performs an as-of join between ``left`` and ``right``.</span>
<span class="sd"> &gt;&gt;&gt; left = spark.createDataFrame([(1, &quot;a&quot;), (5, &quot;b&quot;), (10, &quot;c&quot;)], [&quot;a&quot;, &quot;left_val&quot;])</span>
<span class="sd"> &gt;&gt;&gt; right = spark.createDataFrame([(1, 1), (2, 2), (3, 3), (6, 6), (7, 7)],</span>
<span class="sd"> ... [&quot;a&quot;, &quot;right_val&quot;])</span>
<span class="sd"> &gt;&gt;&gt; left._joinAsOf(</span>
<span class="sd"> ... right, leftAsOfColumn=&quot;a&quot;, rightAsOfColumn=&quot;a&quot;</span>
<span class="sd"> ... ).select(left.a, &#39;left_val&#39;, &#39;right_val&#39;).sort(&quot;a&quot;).collect()</span>
<span class="sd"> [Row(a=1, left_val=&#39;a&#39;, right_val=1),</span>
<span class="sd"> Row(a=5, left_val=&#39;b&#39;, right_val=3),</span>
<span class="sd"> Row(a=10, left_val=&#39;c&#39;, right_val=7)]</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; left._joinAsOf(</span>
<span class="sd"> ... right, leftAsOfColumn=&quot;a&quot;, rightAsOfColumn=&quot;a&quot;, tolerance=sf.lit(1)</span>
<span class="sd"> ... ).select(left.a, &#39;left_val&#39;, &#39;right_val&#39;).sort(&quot;a&quot;).collect()</span>
<span class="sd"> [Row(a=1, left_val=&#39;a&#39;, right_val=1)]</span>
<span class="sd"> &gt;&gt;&gt; left._joinAsOf(</span>
<span class="sd"> ... right, leftAsOfColumn=&quot;a&quot;, rightAsOfColumn=&quot;a&quot;, how=&quot;left&quot;, tolerance=sf.lit(1)</span>
<span class="sd"> ... ).select(left.a, &#39;left_val&#39;, &#39;right_val&#39;).sort(&quot;a&quot;).collect()</span>
<span class="sd"> [Row(a=1, left_val=&#39;a&#39;, right_val=1),</span>
<span class="sd"> Row(a=5, left_val=&#39;b&#39;, right_val=None),</span>
<span class="sd"> Row(a=10, left_val=&#39;c&#39;, right_val=None)]</span>
<span class="sd"> &gt;&gt;&gt; left._joinAsOf(</span>
<span class="sd"> ... right, leftAsOfColumn=&quot;a&quot;, rightAsOfColumn=&quot;a&quot;, allowExactMatches=False</span>
<span class="sd"> ... ).select(left.a, &#39;left_val&#39;, &#39;right_val&#39;).sort(&quot;a&quot;).collect()</span>
<span class="sd"> [Row(a=5, left_val=&#39;b&#39;, right_val=3),</span>
<span class="sd"> Row(a=10, left_val=&#39;c&#39;, right_val=7)]</span>
<span class="sd"> &gt;&gt;&gt; left._joinAsOf(</span>
<span class="sd"> ... right, leftAsOfColumn=&quot;a&quot;, rightAsOfColumn=&quot;a&quot;, direction=&quot;forward&quot;</span>
<span class="sd"> ... ).select(left.a, &#39;left_val&#39;, &#39;right_val&#39;).sort(&quot;a&quot;).collect()</span>
<span class="sd"> [Row(a=1, left_val=&#39;a&#39;, right_val=1),</span>
<span class="sd"> Row(a=5, left_val=&#39;b&#39;, right_val=6)]</span>
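<span class="sd"> As a sketch, the ``nearest`` direction matches the closest key on either side;</span>
<span class="sd"> the output below is illustrative and not verified here, hence the skip marker:</span>
<span class="sd"> &gt;&gt;&gt; left._joinAsOf(</span>
<span class="sd"> ... right, leftAsOfColumn=&quot;a&quot;, rightAsOfColumn=&quot;a&quot;, direction=&quot;nearest&quot;</span>
<span class="sd"> ... ).select(left.a, &#39;left_val&#39;, &#39;right_val&#39;).sort(&quot;a&quot;).collect() # doctest: +SKIP</span>
<span class="sd"> [Row(a=1, left_val=&#39;a&#39;, right_val=1),</span>
<span class="sd"> Row(a=5, left_val=&#39;b&#39;, right_val=6),</span>
<span class="sd"> Row(a=10, left_val=&#39;c&#39;, right_val=7)]</span>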
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.sortWithinPartitions"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.sortWithinPartitions.html#pyspark.sql.DataFrame.sortWithinPartitions">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">sortWithinPartitions</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]],</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`DataFrame` with each partition sorted by the specified column(s).</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : int, str, list or :class:`Column`, optional</span>
<span class="sd"> list of :class:`Column` or column names or column ordinals to sort by.</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> Supports column ordinal.</span>
<span class="sd"> Other Parameters</span>
<span class="sd"> ----------------</span>
<span class="sd"> ascending : bool or list, optional, default True</span>
<span class="sd"> boolean or list of boolean.</span>
<span class="sd"> Sort ascending vs. descending. Specify list for multiple sort orders.</span>
<span class="sd"> If a list is specified, its length must equal the length of `cols`.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> DataFrame with each partition sorted by the specified column(s).</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> A column ordinal starts from 1, which is different from the</span>
<span class="sd"> 0-based :meth:`__getitem__`.</span>
<span class="sd"> If a column ordinal is negative, that column is sorted in descending order.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.sortWithinPartitions(&quot;age&quot;, ascending=False)</span>
<span class="sd"> DataFrame[age: bigint, name: string]</span>
<span class="sd"> &gt;&gt;&gt; df.coalesce(1).sortWithinPartitions(1).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &gt;&gt;&gt; df.coalesce(1).sortWithinPartitions(-1).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> +---+-----+</span>
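<span class="sd"> Note that with more than one partition each partition is sorted independently,</span>
<span class="sd"> so no global row order is guaranteed. A minimal sketch (not executed here):</span>
<span class="sd"> &gt;&gt;&gt; df.repartition(2, &quot;name&quot;).sortWithinPartitions(&quot;age&quot;) # doctest: +SKIP</span>
<span class="sd"> DataFrame[age: bigint, name: string]</span>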
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.sort"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.sort.html#pyspark.sql.DataFrame.sort">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">sort</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]],</span>
<span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`DataFrame` sorted by the specified column(s).</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : int, str, list, or :class:`Column`, optional</span>
<span class="sd"> list of :class:`Column` or column names or column ordinals to sort by.</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> Supports column ordinal.</span>
<span class="sd"> Other Parameters</span>
<span class="sd"> ----------------</span>
<span class="sd"> ascending : bool or list, optional, default True</span>
<span class="sd"> boolean or list of boolean.</span>
<span class="sd"> Sort ascending vs. descending. Specify list for multiple sort orders.</span>
<span class="sd"> If a list is specified, its length must equal the length of `cols`.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Sorted DataFrame.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> A column ordinal starts from 1, which is different from the</span>
<span class="sd"> 0-based :meth:`__getitem__`.</span>
<span class="sd"> If a column ordinal is negative, that column is sorted in descending order.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> Sort the DataFrame in ascending order.</span>
<span class="sd"> &gt;&gt;&gt; df.sort(sf.asc(&quot;age&quot;)).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &gt;&gt;&gt; df.sort(1).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Sort the DataFrame in descending order.</span>
<span class="sd"> &gt;&gt;&gt; df.sort(df.age.desc()).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &gt;&gt;&gt; df.orderBy(df.age.desc()).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &gt;&gt;&gt; df.sort(&quot;age&quot;, ascending=False).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &gt;&gt;&gt; df.sort(-1).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Specify multiple columns</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (2, &quot;Alice&quot;), (2, &quot;Bob&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.orderBy(sf.desc(&quot;age&quot;), &quot;name&quot;).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 2| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &gt;&gt;&gt; df.orderBy(-1, &quot;name&quot;).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 2| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &gt;&gt;&gt; df.orderBy(-1, 2).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 2| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Specify multiple columns for sorting order at `ascending`.</span>
<span class="sd"> &gt;&gt;&gt; df.orderBy([&quot;age&quot;, &quot;name&quot;], ascending=[False, False]).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> | 2| Bob|</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &gt;&gt;&gt; df.orderBy([1, &quot;name&quot;], ascending=[False, False]).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> | 2| Bob|</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &gt;&gt;&gt; df.orderBy([1, 2], ascending=[False, False]).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> | 2| Bob|</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="n">orderBy</span> <span class="o">=</span> <span class="n">sort</span>
<div class="viewcode-block" id="DataFrame.describe"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.describe.html#pyspark.sql.DataFrame.describe">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">describe</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Computes basic statistics for numeric and string columns.</span>
<span class="sd"> .. versionadded:: 1.3.1</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> This includes count, mean, stddev, min, and max. If no columns are</span>
<span class="sd"> given, this function computes statistics for all numerical or string columns.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This function is meant for exploratory data analysis, as we make no</span>
<span class="sd"> guarantee about the backward compatibility of the schema of the resulting</span>
<span class="sd"> :class:`DataFrame`.</span>
<span class="sd"> Use summary for expanded statistics and control over which statistics to compute.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : str, list, optional</span>
<span class="sd"> Column name or list of column names to describe (default: all columns).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> A new DataFrame that describes (provides statistics for) the given DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;Bob&quot;, 13, 40.3, 150.5), (&quot;Alice&quot;, 12, 37.8, 142.3), (&quot;Tom&quot;, 11, 44.1, 142.2)],</span>
<span class="sd"> ... [&quot;name&quot;, &quot;age&quot;, &quot;weight&quot;, &quot;height&quot;],</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.describe([&#39;age&#39;]).show()</span>
<span class="sd"> +-------+----+</span>
<span class="sd"> |summary| age|</span>
<span class="sd"> +-------+----+</span>
<span class="sd"> | count| 3|</span>
<span class="sd"> | mean|12.0|</span>
<span class="sd"> | stddev| 1.0|</span>
<span class="sd"> | min| 11|</span>
<span class="sd"> | max| 13|</span>
<span class="sd"> +-------+----+</span>
<span class="sd"> &gt;&gt;&gt; df.describe([&#39;age&#39;, &#39;weight&#39;, &#39;height&#39;]).show()</span>
<span class="sd"> +-------+----+------------------+-----------------+</span>
<span class="sd"> |summary| age| weight| height|</span>
<span class="sd"> +-------+----+------------------+-----------------+</span>
<span class="sd"> | count| 3| 3| 3|</span>
<span class="sd"> | mean|12.0| 40.73333333333333| 145.0|</span>
<span class="sd"> | stddev| 1.0|3.1722757341273704|4.763402145525822|</span>
<span class="sd"> | min| 11| 37.8| 142.2|</span>
<span class="sd"> | max| 13| 44.1| 150.5|</span>
<span class="sd"> +-------+----+------------------+-----------------+</span>
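<span class="sd"> For string columns, count, min, and max are reported while mean and stddev are</span>
<span class="sd"> NULL; a sketch with illustrative, unverified output:</span>
<span class="sd"> &gt;&gt;&gt; df.describe(&quot;name&quot;).show() # doctest: +SKIP</span>
<span class="sd"> +-------+-----+</span>
<span class="sd"> |summary| name|</span>
<span class="sd"> +-------+-----+</span>
<span class="sd"> | count| 3|</span>
<span class="sd"> | mean| NULL|</span>
<span class="sd"> | stddev| NULL|</span>
<span class="sd"> | min|Alice|</span>
<span class="sd"> | max| Tom|</span>
<span class="sd"> +-------+-----+</span>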
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.summary</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.summary"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.summary.html#pyspark.sql.DataFrame.summary">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">summary</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">statistics</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Computes specified statistics for numeric and string columns. Available statistics are:</span>
<span class="sd"> - count</span>
<span class="sd"> - mean</span>
<span class="sd"> - stddev</span>
<span class="sd"> - min</span>
<span class="sd"> - max</span>
<span class="sd"> - arbitrary approximate percentiles specified as a percentage (e.g., 75%)</span>
<span class="sd"> If no statistics are given, this function computes count, mean, stddev, min,</span>
<span class="sd"> approximate quartiles (percentiles at 25%, 50%, and 75%), and max.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> statistics : str, optional</span>
<span class="sd"> The statistics to compute, chosen from the list above. If none are given,</span>
<span class="sd"> the default set described above is computed.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> A new DataFrame that provides statistics for the given DataFrame.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This function is meant for exploratory data analysis, as we make no</span>
<span class="sd"> guarantee about the backward compatibility of the schema of the resulting</span>
<span class="sd"> :class:`DataFrame`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(&quot;Bob&quot;, 13, 40.3, 150.5), (&quot;Alice&quot;, 12, 37.8, 142.3), (&quot;Tom&quot;, 11, 44.1, 142.2)],</span>
<span class="sd"> ... [&quot;name&quot;, &quot;age&quot;, &quot;weight&quot;, &quot;height&quot;],</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;age&quot;, &quot;weight&quot;, &quot;height&quot;).summary().show()</span>
<span class="sd"> +-------+----+------------------+-----------------+</span>
<span class="sd"> |summary| age| weight| height|</span>
<span class="sd"> +-------+----+------------------+-----------------+</span>
<span class="sd"> | count| 3| 3| 3|</span>
<span class="sd"> | mean|12.0| 40.73333333333333| 145.0|</span>
<span class="sd"> | stddev| 1.0|3.1722757341273704|4.763402145525822|</span>
<span class="sd"> | min| 11| 37.8| 142.2|</span>
<span class="sd"> | 25%| 11| 37.8| 142.2|</span>
<span class="sd"> | 50%| 12| 40.3| 142.3|</span>
<span class="sd"> | 75%| 13| 44.1| 150.5|</span>
<span class="sd"> | max| 13| 44.1| 150.5|</span>
<span class="sd"> +-------+----+------------------+-----------------+</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;age&quot;, &quot;weight&quot;, &quot;height&quot;).summary(&quot;count&quot;, &quot;min&quot;, &quot;25%&quot;, &quot;75%&quot;, &quot;max&quot;).show()</span>
<span class="sd"> +-------+---+------+------+</span>
<span class="sd"> |summary|age|weight|height|</span>
<span class="sd"> +-------+---+------+------+</span>
<span class="sd"> | count| 3| 3| 3|</span>
<span class="sd"> | min| 11| 37.8| 142.2|</span>
<span class="sd"> | 25%| 11| 37.8| 142.2|</span>
<span class="sd"> | 75%| 13| 44.1| 150.5|</span>
<span class="sd"> | max| 13| 44.1| 150.5|</span>
<span class="sd"> +-------+---+------+------+</span>
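<span class="sd"> Arbitrary approximate percentiles can be requested too; a sketch with illustrative,</span>
<span class="sd"> unverified output (approximate results may vary slightly):</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;age&quot;).summary(&quot;count&quot;, &quot;33%&quot;).show() # doctest: +SKIP</span>
<span class="sd"> +-------+---+</span>
<span class="sd"> |summary|age|</span>
<span class="sd"> +-------+---+</span>
<span class="sd"> | count| 3|</span>
<span class="sd"> | 33%| 11|</span>
<span class="sd"> +-------+---+</span>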
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.describe</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Row</span><span class="p">]:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">Row</span><span class="p">]:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.head"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.head.html#pyspark.sql.DataFrame.head">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="n">Row</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="n">Row</span><span class="p">]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the first ``n`` rows.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This method should only be used if the resulting array is expected</span>
<span class="sd"> to be small, as all the data is loaded into the driver&#39;s memory.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> n : int, optional</span>
<span class="sd"> Number of rows to return. If not specified, a single :class:`Row` is returned</span>
<span class="sd"> rather than a list.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> If n is supplied, return a list of the first n :class:`Row` objects,</span>
<span class="sd"> or fewer if the DataFrame has fewer rows.</span>
<span class="sd"> If n is not supplied, return the first :class:`Row`, or ``None`` if the DataFrame is empty.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.head()</span>
<span class="sd"> Row(age=2, name=&#39;Alice&#39;)</span>
<span class="sd"> &gt;&gt;&gt; df.head(1)</span>
<span class="sd"> [Row(age=2, name=&#39;Alice&#39;)]</span>
<span class="sd"> &gt;&gt;&gt; df.head(0)</span>
<span class="sd"> []</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.first"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.first.html#pyspark.sql.DataFrame.first">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">first</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Row</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the first row as a :class:`Row`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`Row`</span>
<span class="sd"> First row if :class:`DataFrame` is not empty, otherwise ``None``.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.first()</span>
<span class="sd"> Row(age=2, name=&#39;Alice&#39;)</span>
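<span class="sd"> If the DataFrame is empty, ``None`` is returned; a sketch using a hypothetical</span>
<span class="sd"> empty DataFrame (not executed here):</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([], &quot;age INT, name STRING&quot;).first() # doctest: +SKIP</span>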
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.__getitem__"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.__getitem__.html#pyspark.sql.DataFrame.__getitem__">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the column as a :class:`Column`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> item : int, str, :class:`Column`, list or tuple</span>
<span class="sd"> column index, column name, column, or a list or tuple of columns</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`Column` or :class:`DataFrame`</span>
<span class="sd"> a specified column, or a filtered or projected dataframe.</span>
<span class="sd"> * If the input `item` is an int or str, the output is a :class:`Column`.</span>
<span class="sd"> * If the input `item` is a :class:`Column`, the output is a :class:`DataFrame`</span>
<span class="sd"> filtered by this given :class:`Column`.</span>
<span class="sd"> * If the input `item` is a list or tuple, the output is a :class:`DataFrame`</span>
<span class="sd"> projected by this given list or tuple.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> Retrieve a column instance.</span>
<span class="sd"> &gt;&gt;&gt; df.select(df[&#39;age&#39;]).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> |age|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 5|</span>
<span class="sd"> +---+</span>
<span class="sd"> &gt;&gt;&gt; df.select(df[1]).show()</span>
<span class="sd"> +-----+</span>
<span class="sd"> | name|</span>
<span class="sd"> +-----+</span>
<span class="sd"> |Alice|</span>
<span class="sd"> | Bob|</span>
<span class="sd"> +-----+</span>
<span class="sd"> Select multiple columns by indexing with a list of column names.</span>
<span class="sd"> &gt;&gt;&gt; df[[&quot;name&quot;, &quot;age&quot;]].show()</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | name|age|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> | Bob| 5|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> &gt;&gt;&gt; df[df.age &gt; 3].show()</span>
<span class="sd"> +---+----+</span>
<span class="sd"> |age|name|</span>
<span class="sd"> +---+----+</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +---+----+</span>
<span class="sd"> &gt;&gt;&gt; df[df[0] &gt; 3].show()</span>
<span class="sd"> +---+----+</span>
<span class="sd"> |age|name|</span>
<span class="sd"> +---+----+</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +---+----+</span>
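<span class="sd"> A tuple of columns is treated like a list and projects the DataFrame</span>
<span class="sd"> (illustrative, not verified here):</span>
<span class="sd"> &gt;&gt;&gt; df[&quot;name&quot;, &quot;age&quot;].show() # doctest: +SKIP</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | name|age|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> | Bob| 5|</span>
<span class="sd"> +-----+---+</span>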
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.__getattr__"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.__getattr__.html#pyspark.sql.DataFrame.__getattr__">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the :class:`Column` denoted by ``name``.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : str</span>
<span class="sd"> Column name to return as :class:`Column`.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`Column`</span>
<span class="sd"> Requested column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> Retrieve a column instance.</span>
<span class="sd"> &gt;&gt;&gt; df.select(df.age).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> |age|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 5|</span>
<span class="sd"> +---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="fm">__dir__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import lit</span>
<span class="sd"> Create a dataframe with a column named &#39;id&#39;.</span>
<span class="sd"> &gt;&gt;&gt; df = spark.range(3)</span>
<span class="sd"> &gt;&gt;&gt; [attr for attr in dir(df) if attr[0] == &#39;i&#39;][:7] # Includes column id</span>
<span class="sd"> [&#39;id&#39;, &#39;inputFiles&#39;, &#39;intersect&#39;, &#39;intersectAll&#39;, &#39;isEmpty&#39;, &#39;isLocal&#39;, &#39;isStreaming&#39;]</span>
<span class="sd"> Add a column named &#39;i_like_pancakes&#39;.</span>
<span class="sd"> &gt;&gt;&gt; df = df.withColumn(&#39;i_like_pancakes&#39;, lit(1))</span>
<span class="sd"> &gt;&gt;&gt; [attr for attr in dir(df) if attr[0] == &#39;i&#39;][:7] # Includes columns i_like_pancakes, id</span>
<span class="sd"> [&#39;i_like_pancakes&#39;, &#39;id&#39;, &#39;inputFiles&#39;, &#39;intersect&#39;, &#39;intersectAll&#39;, &#39;isEmpty&#39;, &#39;isLocal&#39;]</span>
<span class="sd"> Try to add an existing column &#39;inputFiles&#39;.</span>
<span class="sd"> &gt;&gt;&gt; df = df.withColumn(&#39;inputFiles&#39;, lit(2))</span>
<span class="sd"> &gt;&gt;&gt; [attr for attr in dir(df) if attr[0] == &#39;i&#39;][:7] # Doesn&#39;t duplicate inputFiles</span>
<span class="sd"> [&#39;i_like_pancakes&#39;, &#39;id&#39;, &#39;inputFiles&#39;, &#39;intersect&#39;, &#39;intersectAll&#39;, &#39;isEmpty&#39;, &#39;isLocal&#39;]</span>
<span class="sd"> Try to add a column named &#39;id2&#39;.</span>
<span class="sd"> &gt;&gt;&gt; df = df.withColumn(&#39;id2&#39;, lit(3))</span>
<span class="sd"> &gt;&gt;&gt; [attr for attr in dir(df) if attr[0] == &#39;i&#39;][:7] # Result includes id2 and stays sorted</span>
<span class="sd"> [&#39;i_like_pancakes&#39;, &#39;id&#39;, &#39;id2&#39;, &#39;inputFiles&#39;, &#39;intersect&#39;, &#39;intersectAll&#39;, &#39;isEmpty&#39;]</span>
<span class="sd"> Columns that are not valid Python identifiers are not included.</span>
<span class="sd"> &gt;&gt;&gt; df = df.withColumn(&#39;1&#39;, lit(4))</span>
<span class="sd"> &gt;&gt;&gt; df = df.withColumn(&#39;name 1&#39;, lit(5))</span>
<span class="sd"> &gt;&gt;&gt; [attr for attr in dir(df) if attr[0] == &#39;i&#39;][:7] # Doesn&#39;t include 1 or name 1</span>
<span class="sd"> [&#39;i_like_pancakes&#39;, &#39;id&#39;, &#39;id2&#39;, &#39;inputFiles&#39;, &#39;intersect&#39;, &#39;intersectAll&#39;, &#39;isEmpty&#39;]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">select</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">select</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.select"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.select.html#pyspark.sql.DataFrame.select">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span>
<span class="k">def</span> <span class="nf">select</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Projects a set of expressions and returns a new :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : str, :class:`Column`, or list</span>
<span class="sd"> column names (string) or expressions (:class:`Column`).</span>
<span class="sd"> If one of the column names is &#39;*&#39;, that column is expanded to include all columns</span>
<span class="sd"> in the current :class:`DataFrame`.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> A DataFrame with subset (or all) of columns.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> Select all columns in the DataFrame.</span>
<span class="sd"> &gt;&gt;&gt; df.select(&#39;*&#39;).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Select a column with other expressions in the DataFrame.</span>
<span class="sd"> &gt;&gt;&gt; df.select(df.name, (df.age + 10).alias(&#39;age&#39;)).show()</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | name|age|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |Alice| 12|</span>
<span class="sd"> | Bob| 15|</span>
<span class="sd"> +-----+---+</span>
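<span class="sd"> A single list of columns can also be passed, which is equivalent to unpacking it</span>
<span class="sd"> (illustrative, not verified here):</span>
<span class="sd"> &gt;&gt;&gt; df.select([&quot;name&quot;, &quot;age&quot;]).show() # doctest: +SKIP</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | name|age|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> | Bob| 5|</span>
<span class="sd"> +-----+---+</span>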
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">selectExpr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">expr</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">selectExpr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">expr</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.selectExpr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.selectExpr.html#pyspark.sql.DataFrame.selectExpr">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">selectExpr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">expr</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Projects a set of SQL expressions and returns a new :class:`DataFrame`.</span>
<span class="sd"> This is a variant of :func:`select` that accepts SQL expressions.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> A DataFrame with new/old columns transformed by expressions.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.selectExpr(&quot;age * 2&quot;, &quot;abs(age)&quot;).show()</span>
<span class="sd"> +---------+--------+</span>
<span class="sd"> |(age * 2)|abs(age)|</span>
<span class="sd"> +---------+--------+</span>
<span class="sd"> | 4| 2|</span>
<span class="sd"> | 10| 5|</span>
<span class="sd"> +---------+--------+</span>
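<span class="sd"> SQL aliases can name the output columns; a sketch with illustrative,</span>
<span class="sd"> unverified output:</span>
<span class="sd"> &gt;&gt;&gt; df.selectExpr(&quot;age * 2 AS double_age&quot;, &quot;name&quot;).show() # doctest: +SKIP</span>
<span class="sd"> +----------+-----+</span>
<span class="sd"> |double_age| name|</span>
<span class="sd"> +----------+-----+</span>
<span class="sd"> | 4|Alice|</span>
<span class="sd"> | 10| Bob|</span>
<span class="sd"> +----------+-----+</span>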
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.filter"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.filter.html#pyspark.sql.DataFrame.filter">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">filter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">condition</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Filters rows using the given condition.</span>
<span class="sd"> :func:`where` is an alias for :func:`filter`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> condition : :class:`Column` or str</span>
<span class="sd"> A :class:`Column` of :class:`types.BooleanType`</span>
<span class="sd"> or a SQL expression given as a string.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> A new DataFrame with rows that satisfy the condition.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (2, &quot;Alice&quot;, &quot;Math&quot;), (5, &quot;Bob&quot;, &quot;Physics&quot;), (7, &quot;Charlie&quot;, &quot;Chemistry&quot;)],</span>
<span class="sd"> ... schema=[&quot;age&quot;, &quot;name&quot;, &quot;subject&quot;])</span>
<span class="sd"> Filter by :class:`Column` instances.</span>
<span class="sd"> &gt;&gt;&gt; df.filter(df.age &gt; 3).show()</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> |age| name| subject|</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> | 5| Bob| Physics|</span>
<span class="sd"> | 7|Charlie|Chemistry|</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> &gt;&gt;&gt; df.where(df.age == 2).show()</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> |age| name|subject|</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> | 2|Alice| Math|</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> Filter by SQL expression in a string.</span>
<span class="sd"> &gt;&gt;&gt; df.filter(&quot;age &gt; 3&quot;).show()</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> |age| name| subject|</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> | 5| Bob| Physics|</span>
<span class="sd"> | 7|Charlie|Chemistry|</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> &gt;&gt;&gt; df.where(&quot;age = 2&quot;).show()</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> |age| name|subject|</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> | 2|Alice| Math|</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> Filter by multiple conditions.</span>
<span class="sd"> &gt;&gt;&gt; df.filter((df.age &gt; 3) &amp; (df.subject == &quot;Physics&quot;)).show()</span>
<span class="sd"> +---+----+-------+</span>
<span class="sd"> |age|name|subject|</span>
<span class="sd"> +---+----+-------+</span>
<span class="sd"> | 5| Bob|Physics|</span>
<span class="sd"> +---+----+-------+</span>
<span class="sd"> &gt;&gt;&gt; df.filter((df.age == 2) | (df.subject == &quot;Chemistry&quot;)).show()</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> |age| name| subject|</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> | 2| Alice| Math|</span>
<span class="sd"> | 7|Charlie|Chemistry|</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> Filter by multiple conditions using SQL expression.</span>
<span class="sd"> &gt;&gt;&gt; df.filter(&quot;age &gt; 3 AND name = &#39;Bob&#39;&quot;).show()</span>
<span class="sd"> +---+----+-------+</span>
<span class="sd"> |age|name|subject|</span>
<span class="sd"> +---+----+-------+</span>
<span class="sd"> | 5| Bob|Physics|</span>
<span class="sd"> +---+----+-------+</span>
<span class="sd"> Filter using the :func:`Column.isin` function.</span>
<span class="sd"> &gt;&gt;&gt; df.filter(df.name.isin(&quot;Alice&quot;, &quot;Bob&quot;)).show()</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> |age| name|subject|</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> | 2|Alice| Math|</span>
<span class="sd"> | 5| Bob|Physics|</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> Filter by a list of values using the :func:`Column.isin` function.</span>
<span class="sd"> &gt;&gt;&gt; df.filter(df.subject.isin([&quot;Math&quot;, &quot;Physics&quot;])).show()</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> |age| name|subject|</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> | 2|Alice| Math|</span>
<span class="sd"> | 5| Bob|Physics|</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> Filter using the `~` operator to exclude certain values.</span>
<span class="sd"> &gt;&gt;&gt; df.filter(~df.name.isin([&quot;Alice&quot;, &quot;Charlie&quot;])).show()</span>
<span class="sd"> +---+----+-------+</span>
<span class="sd"> |age|name|subject|</span>
<span class="sd"> +---+----+-------+</span>
<span class="sd"> | 5| Bob|Physics|</span>
<span class="sd"> +---+----+-------+</span>
<span class="sd"> Filter using the :func:`Column.isNotNull` function.</span>
<span class="sd"> &gt;&gt;&gt; df.filter(df.name.isNotNull()).show()</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> |age| name| subject|</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> | 2| Alice| Math|</span>
<span class="sd"> | 5| Bob| Physics|</span>
<span class="sd"> | 7|Charlie|Chemistry|</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> Filter using the :func:`Column.like` function.</span>
<span class="sd"> &gt;&gt;&gt; df.filter(df.name.like(&quot;Al%&quot;)).show()</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> |age| name|subject|</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> | 2|Alice| Math|</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> Filter using the :func:`Column.contains` function.</span>
<span class="sd"> &gt;&gt;&gt; df.filter(df.name.contains(&quot;i&quot;)).show()</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> |age| name| subject|</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> | 2| Alice| Math|</span>
<span class="sd"> | 7|Charlie|Chemistry|</span>
<span class="sd"> +---+-------+---------+</span>
<span class="sd"> Filter using the :func:`Column.between` function.</span>
<span class="sd"> &gt;&gt;&gt; df.filter(df.age.between(2, 5)).show()</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> |age| name|subject|</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> | 2|Alice| Math|</span>
<span class="sd"> | 5| Bob|Physics|</span>
<span class="sd"> +---+-----+-------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">groupBy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrNameOrOrdinal&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">groupBy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.groupBy"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.groupBy.html#pyspark.sql.DataFrame.groupBy">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span>
<span class="k">def</span> <span class="nf">groupBy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrNameOrOrdinal&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Groups the :class:`DataFrame` by the specified columns so that aggregation</span>
<span class="sd"> can be performed on them.</span>
<span class="sd"> See :class:`GroupedData` for all the available aggregate functions.</span>
<span class="sd"> :func:`groupby` is an alias for :func:`groupBy`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : list, str, int or :class:`Column`</span>
<span class="sd"> The columns to group by.</span>
<span class="sd"> Each element can be a column name (string) or an expression (:class:`Column`)</span>
<span class="sd"> or a column ordinal (int, 1-based) or list of them.</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> Supports column ordinal.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`GroupedData`</span>
<span class="sd"> A :class:`GroupedData` object representing the grouped data by the specified columns.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> A column ordinal starts from 1, which is different from the</span>
<span class="sd"> 0-based :meth:`__getitem__`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (&quot;Alice&quot;, 2), (&quot;Bob&quot;, 2), (&quot;Bob&quot;, 2), (&quot;Bob&quot;, 5)], schema=[&quot;name&quot;, &quot;age&quot;])</span>
<span class="sd"> Example 1: Empty grouping columns triggers a global aggregation.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy().avg().show()</span>
<span class="sd"> +--------+</span>
<span class="sd"> |avg(age)|</span>
<span class="sd"> +--------+</span>
<span class="sd"> | 2.75|</span>
<span class="sd"> +--------+</span>
<span class="sd"> Example 2: Group-by &#39;name&#39;, and specify a dictionary to calculate the summation of &#39;age&#39;.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(&quot;name&quot;).agg({&quot;age&quot;: &quot;sum&quot;}).sort(&quot;name&quot;).show()</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> | name|sum(age)|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> | Bob| 9|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> Example 3: Group-by &#39;name&#39;, and calculate maximum values.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(df.name).max().sort(&quot;name&quot;).show()</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> | name|max(age)|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> | Bob| 5|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> Example 4: Also group-by &#39;name&#39;, but using the column ordinal.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy(1).max().sort(&quot;name&quot;).show()</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> | name|max(age)|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> |Alice| 2|</span>
<span class="sd"> | Bob| 5|</span>
<span class="sd"> +-----+--------+</span>
<span class="sd"> Example 5: Group-by &#39;name&#39; and &#39;age&#39;, and calculate the number of rows in each group.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy([&quot;name&quot;, df.age]).count().sort(&quot;name&quot;, &quot;age&quot;).show()</span>
<span class="sd"> +-----+---+-----+</span>
<span class="sd"> | name|age|count|</span>
<span class="sd"> +-----+---+-----+</span>
<span class="sd"> |Alice| 2| 1|</span>
<span class="sd"> | Bob| 2| 2|</span>
<span class="sd"> | Bob| 5| 1|</span>
<span class="sd"> +-----+---+-----+</span>
<span class="sd"> Example 6: Also Group-by &#39;name&#39; and &#39;age&#39;, but using the column ordinal.</span>
<span class="sd"> &gt;&gt;&gt; df.groupBy([df.name, 2]).count().sort(&quot;name&quot;, &quot;age&quot;).show()</span>
<span class="sd"> +-----+---+-----+</span>
<span class="sd"> | name|age|count|</span>
<span class="sd"> +-----+---+-----+</span>
<span class="sd"> |Alice| 2| 1|</span>
<span class="sd"> | Bob| 2| 2|</span>
<span class="sd"> | Bob| 5| 1|</span>
<span class="sd"> +-----+---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">rollup</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">rollup</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.rollup"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.rollup.html#pyspark.sql.DataFrame.rollup">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span>
<span class="k">def</span> <span class="nf">rollup</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrNameOrOrdinal&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create a multi-dimensional rollup for the current :class:`DataFrame` using</span>
<span class="sd"> the specified columns, allowing for aggregation on them.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : list, str, int or :class:`Column`</span>
<span class="sd"> The columns to roll-up by.</span>
<span class="sd"> Each element should be a column name (string) or an expression (:class:`Column`)</span>
<span class="sd"> or a column ordinal (int, 1-based) or list of them.</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> Supports column ordinal.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`GroupedData`</span>
<span class="sd"> Rolled-up data based on the specified columns.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> A column ordinal starts from 1, which is different from the</span>
<span class="sd"> 0-based :meth:`__getitem__`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Alice&quot;, 2), (&quot;Bob&quot;, 5)], schema=[&quot;name&quot;, &quot;age&quot;])</span>
<span class="sd"> Example 1: Rollup-by &#39;name&#39;, and calculate the number of rows in each dimensional.</span>
<span class="sd"> &gt;&gt;&gt; df.rollup(&quot;name&quot;).count().orderBy(&quot;name&quot;).show()</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> | name|count|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> | NULL| 2|</span>
<span class="sd"> |Alice| 1|</span>
<span class="sd"> | Bob| 1|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> Example 2: Rollup-by &#39;name&#39; and &#39;age&#39;,</span>
<span class="sd"> and calculate the number of rows in each dimensional.</span>
<span class="sd"> &gt;&gt;&gt; df.rollup(&quot;name&quot;, df.age).count().orderBy(&quot;name&quot;, &quot;age&quot;).show()</span>
<span class="sd"> +-----+----+-----+</span>
<span class="sd"> | name| age|count|</span>
<span class="sd"> +-----+----+-----+</span>
<span class="sd"> | NULL|NULL| 2|</span>
<span class="sd"> |Alice|NULL| 1|</span>
<span class="sd"> |Alice| 2| 1|</span>
<span class="sd"> | Bob|NULL| 1|</span>
<span class="sd"> | Bob| 5| 1|</span>
<span class="sd"> +-----+----+-----+</span>
<span class="sd"> Example 3: Also Rollup-by &#39;name&#39; and &#39;age&#39;, but using the column ordinal.</span>
<span class="sd"> &gt;&gt;&gt; df.rollup(1, 2).count().orderBy(1, 2).show()</span>
<span class="sd"> +-----+----+-----+</span>
<span class="sd"> | name| age|count|</span>
<span class="sd"> +-----+----+-----+</span>
<span class="sd"> | NULL|NULL| 2|</span>
<span class="sd"> |Alice|NULL| 1|</span>
<span class="sd"> |Alice| 2| 1|</span>
<span class="sd"> | Bob|NULL| 1|</span>
<span class="sd"> | Bob| 5| 1|</span>
<span class="sd"> +-----+----+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">cube</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">cube</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.cube"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.cube.html#pyspark.sql.DataFrame.cube">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span>
<span class="k">def</span> <span class="nf">cube</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create a multi-dimensional cube for the current :class:`DataFrame` using</span>
<span class="sd"> the specified columns, allowing aggregations to be performed on them.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : list, str, int or :class:`Column`</span>
<span class="sd"> The columns to cube by.</span>
<span class="sd"> Each element should be a column name (string) or an expression (:class:`Column`)</span>
<span class="sd"> or a column ordinal (int, 1-based) or list of them.</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> Supports column ordinal.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`GroupedData`</span>
<span class="sd"> Cube of the data based on the specified columns.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> A column ordinal starts from 1, which is different from the</span>
<span class="sd"> 0-based :meth:`__getitem__`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(&quot;Alice&quot;, 2), (&quot;Bob&quot;, 5)], schema=[&quot;name&quot;, &quot;age&quot;])</span>
<span class="sd"> Example 1: Creating a cube on &#39;name&#39;,</span>
<span class="sd"> and calculate the number of rows in each dimensional.</span>
<span class="sd"> &gt;&gt;&gt; df.cube(&quot;name&quot;).count().orderBy(&quot;name&quot;).show()</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> | name|count|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> | NULL| 2|</span>
<span class="sd"> |Alice| 1|</span>
<span class="sd"> | Bob| 1|</span>
<span class="sd"> +-----+-----+</span>
<span class="sd"> Example 2: Creating a cube on &#39;name&#39; and &#39;age&#39;,</span>
<span class="sd"> and calculate the number of rows in each dimensional.</span>
<span class="sd"> &gt;&gt;&gt; df.cube(&quot;name&quot;, df.age).count().orderBy(&quot;name&quot;, &quot;age&quot;).show()</span>
<span class="sd"> +-----+----+-----+</span>
<span class="sd"> | name| age|count|</span>
<span class="sd"> +-----+----+-----+</span>
<span class="sd"> | NULL|NULL| 2|</span>
<span class="sd"> | NULL| 2| 1|</span>
<span class="sd"> | NULL| 5| 1|</span>
<span class="sd"> |Alice|NULL| 1|</span>
<span class="sd"> |Alice| 2| 1|</span>
<span class="sd"> | Bob|NULL| 1|</span>
<span class="sd"> | Bob| 5| 1|</span>
<span class="sd"> +-----+----+-----+</span>
<span class="sd"> Example 3: Also creating a cube on &#39;name&#39; and &#39;age&#39;, but using the column ordinal.</span>
<span class="sd"> &gt;&gt;&gt; df.cube(1, 2).count().orderBy(1, 2).show()</span>
<span class="sd"> +-----+----+-----+</span>
<span class="sd"> | name| age|count|</span>
<span class="sd"> +-----+----+-----+</span>
<span class="sd"> | NULL|NULL| 2|</span>
<span class="sd"> | NULL| 2| 1|</span>
<span class="sd"> | NULL| 5| 1|</span>
<span class="sd"> |Alice|NULL| 1|</span>
<span class="sd"> |Alice| 2| 1|</span>
<span class="sd"> | Bob|NULL| 1|</span>
<span class="sd"> | Bob| 5| 1|</span>
<span class="sd"> +-----+----+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.groupingSets"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.groupingSets.html#pyspark.sql.DataFrame.groupingSets">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">groupingSets</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">groupingSets</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Sequence</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">]],</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create multi-dimensional aggregation for the current `class`:DataFrame using the specified</span>
<span class="sd"> grouping sets, so we can run aggregation on them.</span>
<span class="sd"> .. versionadded:: 4.0.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> groupingSets : sequence of sequence of columns or str</span>
<span class="sd"> Individual set of columns to group on.</span>
<span class="sd"> cols : :class:`Column` or str</span>
<span class="sd"> Addional grouping columns specified by users.</span>
<span class="sd"> Those columns are shown as the output columns after aggregation.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`GroupedData`</span>
<span class="sd"> Grouping sets of the data based on the specified columns.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Group by city and car_model, city, and all, and calculate the sum of quantity.</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (100, &#39;Fremont&#39;, &#39;Honda Civic&#39;, 10),</span>
<span class="sd"> ... (100, &#39;Fremont&#39;, &#39;Honda Accord&#39;, 15),</span>
<span class="sd"> ... (100, &#39;Fremont&#39;, &#39;Honda CRV&#39;, 7),</span>
<span class="sd"> ... (200, &#39;Dublin&#39;, &#39;Honda Civic&#39;, 20),</span>
<span class="sd"> ... (200, &#39;Dublin&#39;, &#39;Honda Accord&#39;, 10),</span>
<span class="sd"> ... (200, &#39;Dublin&#39;, &#39;Honda CRV&#39;, 3),</span>
<span class="sd"> ... (300, &#39;San Jose&#39;, &#39;Honda Civic&#39;, 5),</span>
<span class="sd"> ... (300, &#39;San Jose&#39;, &#39;Honda Accord&#39;, 8)</span>
<span class="sd"> ... ], schema=&quot;id INT, city STRING, car_model STRING, quantity INT&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.groupingSets(</span>
<span class="sd"> ... [(&quot;city&quot;, &quot;car_model&quot;), (&quot;city&quot;,), ()],</span>
<span class="sd"> ... &quot;city&quot;, &quot;car_model&quot;</span>
<span class="sd"> ... ).agg(sf.sum(sf.col(&quot;quantity&quot;)).alias(&quot;sum&quot;)).sort(&quot;city&quot;, &quot;car_model&quot;).show()</span>
<span class="sd"> +--------+------------+---+</span>
<span class="sd"> | city| car_model|sum|</span>
<span class="sd"> +--------+------------+---+</span>
<span class="sd"> | NULL| NULL| 78|</span>
<span class="sd"> | Dublin| NULL| 33|</span>
<span class="sd"> | Dublin|Honda Accord| 10|</span>
<span class="sd"> | Dublin| Honda CRV| 3|</span>
<span class="sd"> | Dublin| Honda Civic| 20|</span>
<span class="sd"> | Fremont| NULL| 32|</span>
<span class="sd"> | Fremont|Honda Accord| 15|</span>
<span class="sd"> | Fremont| Honda CRV| 7|</span>
<span class="sd"> | Fremont| Honda Civic| 10|</span>
<span class="sd"> |San Jose| NULL| 13|</span>
<span class="sd"> |San Jose|Honda Accord| 8|</span>
<span class="sd"> |San Jose| Honda Civic| 5|</span>
<span class="sd"> +--------+------------+---+</span>
<span class="sd"> Example 2: Group by multiple columns and calculate both average and sum.</span>
<span class="sd"> &gt;&gt;&gt; df.groupingSets(</span>
<span class="sd"> ... [(&quot;city&quot;, &quot;car_model&quot;), (&quot;city&quot;,), ()],</span>
<span class="sd"> ... &quot;city&quot;, &quot;car_model&quot;</span>
<span class="sd"> ... ).agg(</span>
<span class="sd"> ... sf.avg(sf.col(&quot;quantity&quot;)).alias(&quot;avg_quantity&quot;),</span>
<span class="sd"> ... sf.sum(sf.col(&quot;quantity&quot;)).alias(&quot;sum_quantity&quot;)</span>
<span class="sd"> ... ).sort(&quot;city&quot;, &quot;car_model&quot;).show()</span>
<span class="sd"> +--------+------------+------------------+------------+</span>
<span class="sd"> | city| car_model| avg_quantity|sum_quantity|</span>
<span class="sd"> +--------+------------+------------------+------------+</span>
<span class="sd"> | NULL| NULL| 9.75| 78|</span>
<span class="sd"> | Dublin| NULL| 11.0| 33|</span>
<span class="sd"> | Dublin|Honda Accord| 10.0| 10|</span>
<span class="sd"> | Dublin| Honda CRV| 3.0| 3|</span>
<span class="sd"> | Dublin| Honda Civic| 20.0| 20|</span>
<span class="sd"> | Fremont| NULL|10.666666666666666| 32|</span>
<span class="sd"> | Fremont|Honda Accord| 15.0| 15|</span>
<span class="sd"> | Fremont| Honda CRV| 7.0| 7|</span>
<span class="sd"> | Fremont| Honda Civic| 10.0| 10|</span>
<span class="sd"> |San Jose| NULL| 6.5| 13|</span>
<span class="sd"> |San Jose|Honda Accord| 8.0| 8|</span>
<span class="sd"> |San Jose| Honda Civic| 5.0| 5|</span>
<span class="sd"> +--------+------------+------------------+------------+</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> GroupedData</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.unpivot"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.unpivot.html#pyspark.sql.DataFrame.unpivot">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">unpivot</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">ids</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="o">...</span><span class="p">]],</span>
<span class="n">values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="o">...</span><span class="p">]]],</span>
<span class="n">variableColumnName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">valueColumnName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Unpivot a DataFrame from wide format to long format, optionally leaving</span>
<span class="sd"> identifier columns set. This is the reverse to `groupBy(...).pivot(...).agg(...)`,</span>
<span class="sd"> except for the aggregation, which cannot be reversed.</span>
<span class="sd"> This function is useful to massage a DataFrame into a format where some</span>
<span class="sd"> columns are identifier columns (&quot;ids&quot;), while all other columns (&quot;values&quot;)</span>
<span class="sd"> are &quot;unpivoted&quot; to the rows, leaving just two non-id columns, named as given</span>
<span class="sd"> by `variableColumnName` and `valueColumnName`.</span>
<span class="sd"> When no &quot;id&quot; columns are given, the unpivoted DataFrame consists of only the</span>
<span class="sd"> &quot;variable&quot; and &quot;value&quot; columns.</span>
<span class="sd"> The `values` columns must not be empty so at least one value must be given to be unpivoted.</span>
<span class="sd"> When `values` is `None`, all non-id columns will be unpivoted.</span>
<span class="sd"> All &quot;value&quot; columns must share a least common data type. Unless they are the same data type,</span>
<span class="sd"> all &quot;value&quot; columns are cast to the nearest common data type. For instance, types</span>
<span class="sd"> `IntegerType` and `LongType` are cast to `LongType`, while `IntegerType` and `StringType`</span>
<span class="sd"> do not have a common data type and `unpivot` fails.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> ids : str, Column, tuple, list</span>
<span class="sd"> Column(s) to use as identifiers. Can be a single column or column name,</span>
<span class="sd"> or a list or tuple for multiple columns.</span>
<span class="sd"> values : str, Column, tuple, list, optional</span>
<span class="sd"> Column(s) to unpivot. Can be a single column or column name, or a list or tuple</span>
<span class="sd"> for multiple columns. If specified, must not be empty. If not specified, uses all</span>
<span class="sd"> columns that are not set as `ids`.</span>
<span class="sd"> variableColumnName : str</span>
<span class="sd"> Name of the variable column.</span>
<span class="sd"> valueColumnName : str</span>
<span class="sd"> Name of the value column.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Unpivoted DataFrame.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(1, 11, 1.1), (2, 12, 1.2)],</span>
<span class="sd"> ... [&quot;id&quot;, &quot;int&quot;, &quot;double&quot;],</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +---+---+------+</span>
<span class="sd"> | id|int|double|</span>
<span class="sd"> +---+---+------+</span>
<span class="sd"> | 1| 11| 1.1|</span>
<span class="sd"> | 2| 12| 1.2|</span>
<span class="sd"> +---+---+------+</span>
<span class="sd"> &gt;&gt;&gt; df.unpivot(&quot;id&quot;, [&quot;int&quot;, &quot;double&quot;], &quot;var&quot;, &quot;val&quot;).show()</span>
<span class="sd"> +---+------+----+</span>
<span class="sd"> | id| var| val|</span>
<span class="sd"> +---+------+----+</span>
<span class="sd"> | 1| int|11.0|</span>
<span class="sd"> | 1|double| 1.1|</span>
<span class="sd"> | 2| int|12.0|</span>
<span class="sd"> | 2|double| 1.2|</span>
<span class="sd"> +---+------+----+</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.melt</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.melt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.melt.html#pyspark.sql.DataFrame.melt">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">melt</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">ids</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="o">...</span><span class="p">]],</span>
<span class="n">values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">&quot;ColumnOrName&quot;</span><span class="p">,</span> <span class="o">...</span><span class="p">]]],</span>
<span class="n">variableColumnName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">valueColumnName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Unpivot a DataFrame from wide format to long format, optionally leaving</span>
<span class="sd"> identifier columns set. This is the reverse to `groupBy(...).pivot(...).agg(...)`,</span>
<span class="sd"> except for the aggregation, which cannot be reversed.</span>
<span class="sd"> :func:`melt` is an alias for :func:`unpivot`.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> ids : str, Column, tuple, list, optional</span>
<span class="sd"> Column(s) to use as identifiers. Can be a single column or column name,</span>
<span class="sd"> or a list or tuple for multiple columns.</span>
<span class="sd"> values : str, Column, tuple, list, optional</span>
<span class="sd"> Column(s) to unpivot. Can be a single column or column name, or a list or tuple</span>
<span class="sd"> for multiple columns. If not specified or empty, use all columns that</span>
<span class="sd"> are not set as `ids`.</span>
<span class="sd"> variableColumnName : str</span>
<span class="sd"> Name of the variable column.</span>
<span class="sd"> valueColumnName : str</span>
<span class="sd"> Name of the value column.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Unpivoted DataFrame.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.unpivot</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.agg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.agg.html#pyspark.sql.DataFrame.agg">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">agg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">exprs</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Aggregate on the entire :class:`DataFrame` without groups</span>
<span class="sd"> (shorthand for ``df.groupBy().agg()``).</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> exprs : :class:`Column` or dict of key and value strings</span>
<span class="sd"> Columns or expressions to aggregate DataFrame by.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Aggregated DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.agg({&quot;age&quot;: &quot;max&quot;}).show()</span>
<span class="sd"> +--------+</span>
<span class="sd"> |max(age)|</span>
<span class="sd"> +--------+</span>
<span class="sd"> | 5|</span>
<span class="sd"> +--------+</span>
<span class="sd"> &gt;&gt;&gt; df.agg(sf.min(df.age)).show()</span>
<span class="sd"> +--------+</span>
<span class="sd"> |min(age)|</span>
<span class="sd"> +--------+</span>
<span class="sd"> | 2|</span>
<span class="sd"> +--------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.observe"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.observe.html#pyspark.sql.DataFrame.observe">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">observe</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">observation</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;Observation&quot;</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span>
<span class="o">*</span><span class="n">exprs</span><span class="p">:</span> <span class="n">Column</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Define (named) metrics to observe on the DataFrame. This method returns an &#39;observed&#39;</span>
<span class="sd"> DataFrame that returns the same result as the input, with the following guarantees:</span>
<span class="sd"> * It will compute the defined aggregates (metrics) on all the data that is flowing through</span>
<span class="sd"> the Dataset at that point.</span>
<span class="sd"> * It will report the value of the defined aggregate columns as soon as we reach a completion</span>
<span class="sd"> point. A completion point is either the end of a query (batch mode) or the end of a</span>
<span class="sd"> streaming epoch. The value of the aggregates only reflects the data processed since</span>
<span class="sd"> the previous completion point.</span>
<span class="sd"> The metrics columns must either contain a literal (e.g. lit(42)), or should contain one or</span>
<span class="sd"> more aggregate functions (e.g. sum(a) or sum(a + b) + avg(c) - lit(1)). Expressions that</span>
<span class="sd"> contain references to the input Dataset&#39;s columns must always be wrapped in an aggregate</span>
<span class="sd"> function.</span>
<span class="sd"> A user can observe these metrics by adding</span>
<span class="sd"> Python&#39;s :class:`~pyspark.sql.streaming.StreamingQueryListener`,</span>
<span class="sd"> Scala/Java&#39;s ``org.apache.spark.sql.streaming.StreamingQueryListener`` or Scala/Java&#39;s</span>
<span class="sd"> ``org.apache.spark.sql.util.QueryExecutionListener`` to the spark session.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> .. versionchanged:: 3.5.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> observation : :class:`Observation` or str</span>
<span class="sd"> `str` to specify the name, or an :class:`Observation` instance to obtain the metric.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Added support for `str` in this parameter.</span>
<span class="sd"> exprs : :class:`Column`</span>
<span class="sd"> column expressions (:class:`Column`).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> the observed :class:`DataFrame`.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> When ``observation`` is :class:`Observation`, this method only supports batch queries.</span>
<span class="sd"> When ``observation`` is a string, this method works for both batch and streaming queries.</span>
<span class="sd"> Continuous execution is currently not supported yet.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> When ``observation`` is :class:`Observation`, only batch queries work as below.</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import col, count, lit, max</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Observation</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; observation = Observation(&quot;my metrics&quot;)</span>
<span class="sd"> &gt;&gt;&gt; observed_df = df.observe(observation, count(lit(1)).alias(&quot;count&quot;), max(col(&quot;age&quot;)))</span>
<span class="sd"> &gt;&gt;&gt; observed_df.count()</span>
<span class="sd"> 2</span>
<span class="sd"> &gt;&gt;&gt; observation.get</span>
<span class="sd"> {&#39;count&#39;: 2, &#39;max(age)&#39;: 5}</span>
<span class="sd"> When ``observation`` is a string, streaming queries also work as below.</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.streaming import StreamingQueryListener</span>
<span class="sd"> &gt;&gt;&gt; import time</span>
<span class="sd"> &gt;&gt;&gt; class MyErrorListener(StreamingQueryListener):</span>
<span class="sd"> ... def onQueryStarted(self, event):</span>
<span class="sd"> ... pass</span>
<span class="sd"> ...</span>
<span class="sd"> ... def onQueryProgress(self, event):</span>
<span class="sd"> ... row = event.progress.observedMetrics.get(&quot;my_event&quot;)</span>
<span class="sd"> ... # Trigger if the number of errors exceeds 5 percent</span>
<span class="sd"> ... num_rows = row.rc</span>
<span class="sd"> ... num_error_rows = row.erc</span>
<span class="sd"> ... ratio = num_error_rows / num_rows</span>
<span class="sd"> ... if ratio &gt; 0.05:</span>
<span class="sd"> ... # Trigger alert</span>
<span class="sd"> ... pass</span>
<span class="sd"> ...</span>
<span class="sd"> ... def onQueryIdle(self, event):</span>
<span class="sd"> ... pass</span>
<span class="sd"> ...</span>
<span class="sd"> ... def onQueryTerminated(self, event):</span>
<span class="sd"> ... pass</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; error_listener = MyErrorListener()</span>
<span class="sd"> &gt;&gt;&gt; spark.streams.addListener(error_listener)</span>
<span class="sd"> &gt;&gt;&gt; sdf = spark.readStream.format(&quot;rate&quot;).load().withColumn(</span>
<span class="sd"> ... &quot;error&quot;, col(&quot;value&quot;)</span>
<span class="sd"> ... )</span>
<span class="sd"> &gt;&gt;&gt; # Observe row count (rc) and error row count (erc) in the streaming Dataset</span>
<span class="sd"> ... observed_ds = sdf.observe(</span>
<span class="sd"> ... &quot;my_event&quot;,</span>
<span class="sd"> ... count(lit(1)).alias(&quot;rc&quot;),</span>
<span class="sd"> ... count(col(&quot;error&quot;)).alias(&quot;erc&quot;))</span>
<span class="sd"> &gt;&gt;&gt; try:</span>
<span class="sd"> ... q = observed_ds.writeStream.format(&quot;console&quot;).start()</span>
<span class="sd"> ... time.sleep(5)</span>
<span class="sd"> ...</span>
<span class="sd"> ... finally:</span>
<span class="sd"> ... q.stop()</span>
<span class="sd"> ... spark.streams.removeListener(error_listener)</span>
<span class="sd"> ...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.union"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.union.html#pyspark.sql.DataFrame.union">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">union</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Return a new :class:`DataFrame` containing the union of rows in this and another</span>
<span class="sd"> :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> other : :class:`DataFrame`</span>
<span class="sd"> Another :class:`DataFrame` that needs to be unioned.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> A new :class:`DataFrame` containing the combined rows with corresponding columns.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.unionAll</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This method performs a SQL-style set union of the rows from both `DataFrame` objects,</span>
<span class="sd"> with no automatic deduplication of elements.</span>
<span class="sd"> Use the `distinct()` method to perform deduplication of rows.</span>
<span class="sd"> The method resolves columns by position (not by name), following the standard behavior</span>
<span class="sd"> in SQL.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Combining two DataFrames with the same schema</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(1, &#39;A&#39;), (2, &#39;B&#39;)], [&#39;id&#39;, &#39;value&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(3, &#39;C&#39;), (4, &#39;D&#39;)], [&#39;id&#39;, &#39;value&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df3 = df1.union(df2)</span>
<span class="sd"> &gt;&gt;&gt; df3.show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | id|value|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 1| A|</span>
<span class="sd"> | 2| B|</span>
<span class="sd"> | 3| C|</span>
<span class="sd"> | 4| D|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Example 2: Combining two DataFrames with different schemas</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import lit</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(100001, 1), (100002, 2)], schema=&quot;id LONG, money INT&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(3, 100003), (4, 100003)], schema=&quot;money INT, id LONG&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df1 = df1.withColumn(&quot;age&quot;, lit(30))</span>
<span class="sd"> &gt;&gt;&gt; df2 = df2.withColumn(&quot;age&quot;, lit(40))</span>
<span class="sd"> &gt;&gt;&gt; df3 = df1.union(df2)</span>
<span class="sd"> &gt;&gt;&gt; df3.show()</span>
<span class="sd"> +------+------+---+</span>
<span class="sd"> | id| money|age|</span>
<span class="sd"> +------+------+---+</span>
<span class="sd"> |100001| 1| 30|</span>
<span class="sd"> |100002| 2| 30|</span>
<span class="sd"> | 3|100003| 40|</span>
<span class="sd"> | 4|100003| 40|</span>
<span class="sd"> +------+------+---+</span>
<span class="sd"> Example 3: Combining two DataFrames with mismatched columns</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(1, 2)], [&quot;A&quot;, &quot;B&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(3, 4)], [&quot;C&quot;, &quot;D&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df3 = df1.union(df2)</span>
<span class="sd"> &gt;&gt;&gt; df3.show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | A| B|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 2|</span>
<span class="sd"> | 3| 4|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> Example 4: Combining duplicate rows from two different DataFrames</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(1, &#39;A&#39;), (2, &#39;B&#39;), (3, &#39;C&#39;)], [&#39;id&#39;, &#39;value&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(3, &#39;C&#39;), (4, &#39;D&#39;)], [&#39;id&#39;, &#39;value&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df3 = df1.union(df2).distinct().sort(&quot;id&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df3.show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | id|value|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 1| A|</span>
<span class="sd"> | 2| B|</span>
<span class="sd"> | 3| C|</span>
<span class="sd"> | 4| D|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.unionAll"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.unionAll.html#pyspark.sql.DataFrame.unionAll">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">unionAll</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Return a new :class:`DataFrame` containing the union of rows in this and another</span>
<span class="sd"> :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> other : :class:`DataFrame`</span>
<span class="sd"> Another :class:`DataFrame` that needs to be combined</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> A new :class:`DataFrame` containing combined rows from both dataframes.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This method combines all rows from both `DataFrame` objects with no automatic</span>
<span class="sd"> deduplication of elements.</span>
<span class="sd"> Use the `distinct()` method to perform deduplication of rows.</span>
<span class="sd"> :func:`unionAll` is an alias to :func:`union`</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> DataFrame.union</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.unionByName"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.unionByName.html#pyspark.sql.DataFrame.unionByName">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">unionByName</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">,</span> <span class="n">allowMissingColumns</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`DataFrame` containing union of rows in this and another</span>
<span class="sd"> :class:`DataFrame`.</span>
<span class="sd"> This method performs a union operation on both input DataFrames, resolving columns by</span>
<span class="sd"> name (rather than position). When `allowMissingColumns` is True, missing columns will</span>
<span class="sd"> be filled with null.</span>
<span class="sd"> .. versionadded:: 2.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> other : :class:`DataFrame`</span>
<span class="sd"> Another :class:`DataFrame` that needs to be combined.</span>
<span class="sd"> allowMissingColumns : bool, optional, default False</span>
<span class="sd"> Specify whether to allow missing columns.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> A new :class:`DataFrame` containing the combined rows with corresponding</span>
<span class="sd"> columns of the two given DataFrames.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Union of two DataFrames with same columns in different order.</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([[1, 2, 3]], [&quot;col0&quot;, &quot;col1&quot;, &quot;col2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([[4, 5, 6]], [&quot;col1&quot;, &quot;col2&quot;, &quot;col0&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df1.unionByName(df2).show()</span>
<span class="sd"> +----+----+----+</span>
<span class="sd"> |col0|col1|col2|</span>
<span class="sd"> +----+----+----+</span>
<span class="sd"> | 1| 2| 3|</span>
<span class="sd"> | 6| 4| 5|</span>
<span class="sd"> +----+----+----+</span>
<span class="sd"> Example 2: Union with missing columns and setting `allowMissingColumns=True`.</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([[1, 2, 3]], [&quot;col0&quot;, &quot;col1&quot;, &quot;col2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([[4, 5, 6]], [&quot;col1&quot;, &quot;col2&quot;, &quot;col3&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df1.unionByName(df2, allowMissingColumns=True).show()</span>
<span class="sd"> +----+----+----+----+</span>
<span class="sd"> |col0|col1|col2|col3|</span>
<span class="sd"> +----+----+----+----+</span>
<span class="sd"> | 1| 2| 3|NULL|</span>
<span class="sd"> |NULL| 4| 5| 6|</span>
<span class="sd"> +----+----+----+----+</span>
<span class="sd"> Example 3: Union of two DataFrames with few common columns.</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([[1, 2, 3]], [&quot;col0&quot;, &quot;col1&quot;, &quot;col2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([[4, 5, 6, 7]], [&quot;col1&quot;, &quot;col2&quot;, &quot;col3&quot;, &quot;col4&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df1.unionByName(df2, allowMissingColumns=True).show()</span>
<span class="sd"> +----+----+----+----+----+</span>
<span class="sd"> |col0|col1|col2|col3|col4|</span>
<span class="sd"> +----+----+----+----+----+</span>
<span class="sd"> | 1| 2| 3|NULL|NULL|</span>
<span class="sd"> |NULL| 4| 5| 6| 7|</span>
<span class="sd"> +----+----+----+----+----+</span>
<span class="sd"> Example 4: Union of two DataFrames with completely different columns.</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([[0, 1, 2]], [&quot;col0&quot;, &quot;col1&quot;, &quot;col2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([[3, 4, 5]], [&quot;col3&quot;, &quot;col4&quot;, &quot;col5&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df1.unionByName(df2, allowMissingColumns=True).show()</span>
<span class="sd"> +----+----+----+----+----+----+</span>
<span class="sd"> |col0|col1|col2|col3|col4|col5|</span>
<span class="sd"> +----+----+----+----+----+----+</span>
<span class="sd"> | 0| 1| 2|NULL|NULL|NULL|</span>
<span class="sd"> |NULL|NULL|NULL| 3| 4| 5|</span>
<span class="sd"> +----+----+----+----+----+----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.intersect"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.intersect.html#pyspark.sql.DataFrame.intersect">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">intersect</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Return a new :class:`DataFrame` containing rows only in</span>
<span class="sd"> both this :class:`DataFrame` and another :class:`DataFrame`.</span>
<span class="sd"> Note that any duplicates are removed. To preserve duplicates</span>
<span class="sd"> use :func:`intersectAll`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> other : :class:`DataFrame`</span>
<span class="sd"> Another :class:`DataFrame` that needs to be combined.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Combined DataFrame.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This is equivalent to `INTERSECT` in SQL.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Intersecting two DataFrames with the same schema</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(&quot;a&quot;, 1), (&quot;a&quot;, 1), (&quot;b&quot;, 3), (&quot;c&quot;, 4)], [&quot;C1&quot;, &quot;C2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(&quot;a&quot;, 1), (&quot;a&quot;, 1), (&quot;b&quot;, 3)], [&quot;C1&quot;, &quot;C2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; result_df = df1.intersect(df2).sort(&quot;C1&quot;, &quot;C2&quot;)</span>
<span class="sd"> &gt;&gt;&gt; result_df.show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | C1| C2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | a| 1|</span>
<span class="sd"> | b| 3|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> Example 2: Intersecting two DataFrames with different schemas</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(1, &quot;A&quot;), (2, &quot;B&quot;)], [&quot;id&quot;, &quot;value&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(2, &quot;B&quot;), (3, &quot;C&quot;)], [&quot;id&quot;, &quot;value&quot;])</span>
<span class="sd"> &gt;&gt;&gt; result_df = df1.intersect(df2).sort(&quot;id&quot;, &quot;value&quot;)</span>
<span class="sd"> &gt;&gt;&gt; result_df.show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | id|value|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 2| B|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Example 3: Intersecting all rows from two DataFrames with mismatched columns</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(1, 2), (1, 2), (3, 4)], [&quot;A&quot;, &quot;B&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(1, 2), (1, 2)], [&quot;C&quot;, &quot;D&quot;])</span>
<span class="sd"> &gt;&gt;&gt; result_df = df1.intersect(df2).sort(&quot;A&quot;, &quot;B&quot;)</span>
<span class="sd"> &gt;&gt;&gt; result_df.show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | A| B|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.intersectAll"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.intersectAll.html#pyspark.sql.DataFrame.intersectAll">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">intersectAll</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Return a new :class:`DataFrame` containing rows in both this :class:`DataFrame`</span>
<span class="sd"> and another :class:`DataFrame` while preserving duplicates.</span>
<span class="sd"> This is equivalent to `INTERSECT ALL` in SQL. As standard in SQL, this function</span>
<span class="sd"> resolves columns by position (not by name).</span>
<span class="sd"> .. versionadded:: 2.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> other : :class:`DataFrame`</span>
<span class="sd"> Another :class:`DataFrame` that needs to be combined.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Combined DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Intersecting two DataFrames with the same schema</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(&quot;a&quot;, 1), (&quot;a&quot;, 1), (&quot;b&quot;, 3), (&quot;c&quot;, 4)], [&quot;C1&quot;, &quot;C2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(&quot;a&quot;, 1), (&quot;a&quot;, 1), (&quot;b&quot;, 3)], [&quot;C1&quot;, &quot;C2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; result_df = df1.intersectAll(df2).sort(&quot;C1&quot;, &quot;C2&quot;)</span>
<span class="sd"> &gt;&gt;&gt; result_df.show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | C1| C2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | a| 1|</span>
<span class="sd"> | a| 1|</span>
<span class="sd"> | b| 3|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> Example 2: Intersecting two DataFrames with different schemas</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(1, &quot;A&quot;), (2, &quot;B&quot;)], [&quot;id&quot;, &quot;value&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(2, &quot;B&quot;), (3, &quot;C&quot;)], [&quot;id&quot;, &quot;value&quot;])</span>
<span class="sd"> &gt;&gt;&gt; result_df = df1.intersectAll(df2).sort(&quot;id&quot;, &quot;value&quot;)</span>
<span class="sd"> &gt;&gt;&gt; result_df.show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | id|value|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 2| B|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Example 3: Intersecting all rows from two DataFrames with mismatched columns</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(1, 2), (1, 2), (3, 4)], [&quot;A&quot;, &quot;B&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(1, 2), (1, 2)], [&quot;C&quot;, &quot;D&quot;])</span>
<span class="sd"> &gt;&gt;&gt; result_df = df1.intersectAll(df2).sort(&quot;A&quot;, &quot;B&quot;)</span>
<span class="sd"> &gt;&gt;&gt; result_df.show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | A| B|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 2|</span>
<span class="sd"> | 1| 2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.subtract"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.subtract.html#pyspark.sql.DataFrame.subtract">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">subtract</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Return a new :class:`DataFrame` containing rows in this :class:`DataFrame`</span>
<span class="sd"> but not in another :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> other : :class:`DataFrame`</span>
<span class="sd"> Another :class:`DataFrame` that needs to be subtracted.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Subtracted DataFrame.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This is equivalent to `EXCEPT DISTINCT` in SQL.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Subtracting two DataFrames with the same schema</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(&quot;a&quot;, 1), (&quot;a&quot;, 1), (&quot;b&quot;, 3), (&quot;c&quot;, 4)], [&quot;C1&quot;, &quot;C2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(&quot;a&quot;, 1), (&quot;a&quot;, 1), (&quot;b&quot;, 3)], [&quot;C1&quot;, &quot;C2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; result_df = df1.subtract(df2)</span>
<span class="sd"> &gt;&gt;&gt; result_df.show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | C1| C2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | c| 4|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> Example 2: Subtracting two DataFrames with different schemas</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(1, &quot;A&quot;), (2, &quot;B&quot;)], [&quot;id&quot;, &quot;value&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(2, &quot;B&quot;), (3, &quot;C&quot;)], [&quot;id&quot;, &quot;value&quot;])</span>
<span class="sd"> &gt;&gt;&gt; result_df = df1.subtract(df2)</span>
<span class="sd"> &gt;&gt;&gt; result_df.show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | id|value|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 1| A|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Example 3: Subtracting two DataFrames with mismatched columns</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.createDataFrame([(1, 2)], [&quot;A&quot;, &quot;B&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(1, 2)], [&quot;C&quot;, &quot;D&quot;])</span>
<span class="sd"> &gt;&gt;&gt; result_df = df1.subtract(df2)</span>
<span class="sd"> &gt;&gt;&gt; result_df.show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | A| B|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> +---+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.dropDuplicates"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.dropDuplicates.html#pyspark.sql.DataFrame.dropDuplicates">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">dropDuplicates</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Return a new :class:`DataFrame` with duplicate rows removed,</span>
<span class="sd"> optionally only considering certain columns.</span>
<span class="sd"> For a static batch :class:`DataFrame`, it just drops duplicate rows. For a streaming</span>
<span class="sd"> :class:`DataFrame`, it will keep all data across triggers as intermediate state to drop</span>
<span class="sd"> duplicates rows. You can use :func:`withWatermark` to limit how late the duplicate data can</span>
<span class="sd"> be and the system will accordingly limit the state. In addition, data older than</span>
<span class="sd"> watermark will be dropped to avoid any possibility of duplicates.</span>
<span class="sd"> :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> subset : list of column names, optional</span>
<span class="sd"> List of columns to use for duplicate comparison (default All columns).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> DataFrame without duplicates.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... Row(name=&#39;Alice&#39;, age=5, height=80),</span>
<span class="sd"> ... Row(name=&#39;Alice&#39;, age=5, height=80),</span>
<span class="sd"> ... Row(name=&#39;Alice&#39;, age=10, height=80)</span>
<span class="sd"> ... ])</span>
<span class="sd"> Deduplicate the same rows.</span>
<span class="sd"> &gt;&gt;&gt; df.dropDuplicates().show()</span>
<span class="sd"> +-----+---+------+</span>
<span class="sd"> | name|age|height|</span>
<span class="sd"> +-----+---+------+</span>
<span class="sd"> |Alice| 5| 80|</span>
<span class="sd"> |Alice| 10| 80|</span>
<span class="sd"> +-----+---+------+</span>
<span class="sd"> Deduplicate values on &#39;name&#39; and &#39;height&#39; columns.</span>
<span class="sd"> &gt;&gt;&gt; df.dropDuplicates([&#39;name&#39;, &#39;height&#39;]).show()</span>
<span class="sd"> +-----+---+------+</span>
<span class="sd"> | name|age|height|</span>
<span class="sd"> +-----+---+------+</span>
<span class="sd"> |Alice| 5| 80|</span>
<span class="sd"> +-----+---+------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.dropDuplicatesWithinWatermark"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.dropDuplicatesWithinWatermark.html#pyspark.sql.DataFrame.dropDuplicatesWithinWatermark">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">dropDuplicatesWithinWatermark</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Return a new :class:`DataFrame` with duplicate rows removed,</span>
<span class="sd"> optionally only considering certain columns, within watermark.</span>
<span class="sd"> This only works with streaming :class:`DataFrame`, and watermark for the input</span>
<span class="sd"> :class:`DataFrame` must be set via :func:`withWatermark`.</span>
<span class="sd"> For a streaming :class:`DataFrame`, this will keep all data across triggers as intermediate</span>
<span class="sd"> state to drop duplicated rows. The state will be kept to guarantee the semantic, &quot;Events</span>
<span class="sd"> are deduplicated as long as the time distance of earliest and latest events are smaller</span>
<span class="sd"> than the delay threshold of watermark.&quot; Users are encouraged to set the delay threshold of</span>
<span class="sd"> watermark longer than max timestamp differences among duplicated events.</span>
<span class="sd"> Note: too late data older than watermark will be dropped.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> subset : List of column names, optional</span>
<span class="sd"> List of columns to use for duplicate comparison (default All columns).</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> DataFrame without duplicates.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import timestamp_seconds</span>
<span class="sd"> &gt;&gt;&gt; df = spark.readStream.format(&quot;rate&quot;).load().selectExpr(</span>
<span class="sd"> ... &quot;value % 5 AS value&quot;, &quot;timestamp&quot;)</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;value&quot;, df.timestamp.alias(&quot;time&quot;)).withWatermark(&quot;time&quot;, &#39;10 minutes&#39;)</span>
<span class="sd"> DataFrame[value: bigint, time: timestamp]</span>
<span class="sd"> Deduplicate the same rows.</span>
<span class="sd"> &gt;&gt;&gt; df.dropDuplicatesWithinWatermark() # doctest: +SKIP</span>
<span class="sd"> Deduplicate values on &#39;value&#39; columns.</span>
<span class="sd"> &gt;&gt;&gt; df.dropDuplicatesWithinWatermark([&#39;value&#39;]) # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.dropna"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.dropna.html#pyspark.sql.DataFrame.dropna">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">dropna</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">how</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;any&quot;</span><span class="p">,</span>
<span class="n">thresh</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="o">...</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`DataFrame` omitting rows with null values.</span>
<span class="sd"> :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are</span>
<span class="sd"> aliases of each other.</span>
<span class="sd"> .. versionadded:: 1.3.1</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> how : str, optional, the values that can be &#39;any&#39; or &#39;all&#39;, default &#39;any&#39;.</span>
<span class="sd"> If &#39;any&#39;, drop a row if it contains any nulls.</span>
<span class="sd"> If &#39;all&#39;, drop a row only if all its values are null.</span>
<span class="sd"> thresh: int, optional, default None.</span>
<span class="sd"> If specified, drop rows that have less than `thresh` non-null values.</span>
<span class="sd"> This overwrites the `how` parameter.</span>
<span class="sd"> subset : str, tuple or list, optional</span>
<span class="sd"> optional list of column names to consider.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> DataFrame with null only rows excluded.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... Row(age=10, height=80, name=&quot;Alice&quot;),</span>
<span class="sd"> ... Row(age=5, height=None, name=&quot;Bob&quot;),</span>
<span class="sd"> ... Row(age=None, height=None, name=&quot;Tom&quot;),</span>
<span class="sd"> ... Row(age=None, height=None, name=None),</span>
<span class="sd"> ... ])</span>
<span class="sd"> Example 1: Drop the row if it contains any nulls.</span>
<span class="sd"> &gt;&gt;&gt; df.na.drop().show()</span>
<span class="sd"> +---+------+-----+</span>
<span class="sd"> |age|height| name|</span>
<span class="sd"> +---+------+-----+</span>
<span class="sd"> | 10| 80|Alice|</span>
<span class="sd"> +---+------+-----+</span>
<span class="sd"> Example 2: Drop the row only if all its values are null.</span>
<span class="sd"> &gt;&gt;&gt; df.na.drop(how=&#39;all&#39;).show()</span>
<span class="sd"> +----+------+-----+</span>
<span class="sd"> | age|height| name|</span>
<span class="sd"> +----+------+-----+</span>
<span class="sd"> | 10| 80|Alice|</span>
<span class="sd"> | 5| NULL| Bob|</span>
<span class="sd"> |NULL| NULL| Tom|</span>
<span class="sd"> +----+------+-----+</span>
<span class="sd"> Example 3: Drop rows that have less than `thresh` non-null values.</span>
<span class="sd"> &gt;&gt;&gt; df.na.drop(thresh=2).show()</span>
<span class="sd"> +---+------+-----+</span>
<span class="sd"> |age|height| name|</span>
<span class="sd"> +---+------+-----+</span>
<span class="sd"> | 10| 80|Alice|</span>
<span class="sd"> | 5| NULL| Bob|</span>
<span class="sd"> +---+------+-----+</span>
<span class="sd"> Example 4: Drop rows with non-null values in the specified columns.</span>
<span class="sd"> &gt;&gt;&gt; df.na.drop(subset=[&#39;age&#39;, &#39;name&#39;]).show()</span>
<span class="sd"> +---+------+-----+</span>
<span class="sd"> |age|height| name|</span>
<span class="sd"> +---+------+-----+</span>
<span class="sd"> | 10| 80|Alice|</span>
<span class="sd"> | 5| NULL| Bob|</span>
<span class="sd"> +---+------+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">value</span><span class="p">:</span> <span class="s2">&quot;LiteralType&quot;</span><span class="p">,</span>
<span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="o">...</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="s2">&quot;LiteralType&quot;</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.fillna"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.fillna.html#pyspark.sql.DataFrame.fillna">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">value</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;LiteralType&quot;</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="s2">&quot;LiteralType&quot;</span><span class="p">]],</span>
<span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="o">...</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`DataFrame` which null values are filled with new value.</span>
<span class="sd"> :func:`DataFrame.fillna` and :func:`DataFrameNaFunctions.fill` are</span>
<span class="sd"> aliases of each other.</span>
<span class="sd"> .. versionadded:: 1.3.1</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> value : int, float, string, bool or dict, the value to replace null values with.</span>
<span class="sd"> If the value is a dict, then `subset` is ignored and `value` must be a mapping</span>
<span class="sd"> from column name (string) to replacement value. The replacement value must be</span>
<span class="sd"> an int, float, boolean, or string.</span>
<span class="sd"> subset : str, tuple or list, optional</span>
<span class="sd"> optional list of column names to consider.</span>
<span class="sd"> Columns specified in subset that do not have matching data types are ignored.</span>
<span class="sd"> For example, if `value` is a string, and subset contains a non-string column,</span>
<span class="sd"> then the non-string column is simply ignored.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> DataFrame with replaced null values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (10, 80.5, &quot;Alice&quot;, None),</span>
<span class="sd"> ... (5, None, &quot;Bob&quot;, None),</span>
<span class="sd"> ... (None, None, &quot;Tom&quot;, None),</span>
<span class="sd"> ... (None, None, None, True)],</span>
<span class="sd"> ... schema=[&quot;age&quot;, &quot;height&quot;, &quot;name&quot;, &quot;bool&quot;])</span>
<span class="sd"> Example 1: Fill all null values with 50 for numeric columns.</span>
<span class="sd"> &gt;&gt;&gt; df.na.fill(50).show()</span>
<span class="sd"> +---+------+-----+----+</span>
<span class="sd"> |age|height| name|bool|</span>
<span class="sd"> +---+------+-----+----+</span>
<span class="sd"> | 10| 80.5|Alice|NULL|</span>
<span class="sd"> | 5| 50.0| Bob|NULL|</span>
<span class="sd"> | 50| 50.0| Tom|NULL|</span>
<span class="sd"> | 50| 50.0| NULL|true|</span>
<span class="sd"> +---+------+-----+----+</span>
<span class="sd"> Example 2: Fill all null values with ``False`` for boolean columns.</span>
<span class="sd"> &gt;&gt;&gt; df.na.fill(False).show()</span>
<span class="sd"> +----+------+-----+-----+</span>
<span class="sd"> | age|height| name| bool|</span>
<span class="sd"> +----+------+-----+-----+</span>
<span class="sd"> | 10| 80.5|Alice|false|</span>
<span class="sd"> | 5| NULL| Bob|false|</span>
<span class="sd"> |NULL| NULL| Tom|false|</span>
<span class="sd"> |NULL| NULL| NULL| true|</span>
<span class="sd"> +----+------+-----+-----+</span>
<span class="sd"> Example 3: Fill all null values with to 50 and &quot;unknown&quot; for</span>
<span class="sd"> &#39;age&#39; and &#39;name&#39; column respectively.</span>
<span class="sd"> &gt;&gt;&gt; df.na.fill({&#39;age&#39;: 50, &#39;name&#39;: &#39;unknown&#39;}).show()</span>
<span class="sd"> +---+------+-------+----+</span>
<span class="sd"> |age|height| name|bool|</span>
<span class="sd"> +---+------+-------+----+</span>
<span class="sd"> | 10| 80.5| Alice|NULL|</span>
<span class="sd"> | 5| NULL| Bob|NULL|</span>
<span class="sd"> | 50| NULL| Tom|NULL|</span>
<span class="sd"> | 50| NULL|unknown|true|</span>
<span class="sd"> +---+------+-------+----+</span>
<span class="sd"> Example 4: Fill all null values with &quot;Spark&quot; for &#39;name&#39; column.</span>
<span class="sd"> &gt;&gt;&gt; df.na.fill(value = &#39;Spark&#39;, subset = &#39;name&#39;).show()</span>
<span class="sd"> +----+------+-----+----+</span>
<span class="sd"> | age|height| name|bool|</span>
<span class="sd"> +----+------+-----+----+</span>
<span class="sd"> | 10| 80.5|Alice|NULL|</span>
<span class="sd"> | 5| NULL| Bob|NULL|</span>
<span class="sd"> |NULL| NULL| Tom|NULL|</span>
<span class="sd"> |NULL| NULL|Spark|true|</span>
<span class="sd"> +----+------+-----+----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">replace</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">to_replace</span><span class="p">:</span> <span class="s2">&quot;LiteralType&quot;</span><span class="p">,</span>
<span class="n">value</span><span class="p">:</span> <span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">,</span>
<span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">replace</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">to_replace</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;LiteralType&quot;</span><span class="p">],</span>
<span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">],</span>
<span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">replace</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">to_replace</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="s2">&quot;LiteralType&quot;</span><span class="p">,</span> <span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">],</span>
<span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">replace</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">to_replace</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;LiteralType&quot;</span><span class="p">],</span>
<span class="n">value</span><span class="p">:</span> <span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">,</span>
<span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.replace"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.replace.html#pyspark.sql.DataFrame.replace">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span>
<span class="k">def</span> <span class="nf">replace</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">to_replace</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span>
<span class="s2">&quot;LiteralType&quot;</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;LiteralType&quot;</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="s2">&quot;LiteralType&quot;</span><span class="p">,</span> <span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">]</span>
<span class="p">],</span>
<span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span>
<span class="n">Union</span><span class="p">[</span><span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">],</span> <span class="n">_NoValueType</span><span class="p">]</span>
<span class="p">]</span> <span class="o">=</span> <span class="n">_NoValue</span><span class="p">,</span>
<span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`DataFrame` replacing a value with another value.</span>
<span class="sd"> :func:`DataFrame.replace` and :func:`DataFrameNaFunctions.replace` are</span>
<span class="sd"> aliases of each other.</span>
<span class="sd"> Values to_replace and value must have the same type and can only be numerics, booleans,</span>
<span class="sd"> or strings. Value can have None. When replacing, the new value will be cast</span>
<span class="sd"> to the type of the existing column.</span>
<span class="sd"> For numeric replacements all values to be replaced should have unique</span>
<span class="sd"> floating point representation. In case of conflicts (for example with `{42: -1, 42.0: 1}`)</span>
<span class="sd"> and arbitrary replacement will be used.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> to_replace : bool, int, float, string, list or dict, the value to be replaced.</span>
<span class="sd"> If the value is a dict, then `value` is ignored or can be omitted, and `to_replace`</span>
<span class="sd"> must be a mapping between a value and a replacement.</span>
<span class="sd"> value : bool, int, float, string or None, optional</span>
<span class="sd"> The replacement value must be a bool, int, float, string or None. If `value` is a</span>
<span class="sd"> list, `value` should be of the same length and type as `to_replace`.</span>
<span class="sd"> If `value` is a scalar and `to_replace` is a sequence, then `value` is</span>
<span class="sd"> used as a replacement for each item in `to_replace`.</span>
<span class="sd"> subset : list, optional</span>
<span class="sd"> optional list of column names to consider.</span>
<span class="sd"> Columns specified in subset that do not have matching data types are ignored.</span>
<span class="sd"> For example, if `value` is a string, and subset contains a non-string column,</span>
<span class="sd"> then the non-string column is simply ignored.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> DataFrame with replaced values.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([</span>
<span class="sd"> ... (10, 80, &quot;Alice&quot;),</span>
<span class="sd"> ... (5, None, &quot;Bob&quot;),</span>
<span class="sd"> ... (None, 10, &quot;Tom&quot;),</span>
<span class="sd"> ... (None, None, None)],</span>
<span class="sd"> ... schema=[&quot;age&quot;, &quot;height&quot;, &quot;name&quot;])</span>
<span class="sd"> Example 1: Replace 10 to 20 in all columns.</span>
<span class="sd"> &gt;&gt;&gt; df.na.replace(10, 20).show()</span>
<span class="sd"> +----+------+-----+</span>
<span class="sd"> | age|height| name|</span>
<span class="sd"> +----+------+-----+</span>
<span class="sd"> | 20| 80|Alice|</span>
<span class="sd"> | 5| NULL| Bob|</span>
<span class="sd"> |NULL| 20| Tom|</span>
<span class="sd"> |NULL| NULL| NULL|</span>
<span class="sd"> +----+------+-----+</span>
<span class="sd"> Example 2: Replace &#39;Alice&#39; to null in all columns.</span>
<span class="sd"> &gt;&gt;&gt; df.na.replace(&#39;Alice&#39;, None).show()</span>
<span class="sd"> +----+------+----+</span>
<span class="sd"> | age|height|name|</span>
<span class="sd"> +----+------+----+</span>
<span class="sd"> | 10| 80|NULL|</span>
<span class="sd"> | 5| NULL| Bob|</span>
<span class="sd"> |NULL| 10| Tom|</span>
<span class="sd"> |NULL| NULL|NULL|</span>
<span class="sd"> +----+------+----+</span>
<span class="sd"> Example 3: Replace &#39;Alice&#39; to &#39;A&#39;, and &#39;Bob&#39; to &#39;B&#39; in the &#39;name&#39; column.</span>
<span class="sd"> &gt;&gt;&gt; df.na.replace([&#39;Alice&#39;, &#39;Bob&#39;], [&#39;A&#39;, &#39;B&#39;], &#39;name&#39;).show()</span>
<span class="sd"> +----+------+----+</span>
<span class="sd"> | age|height|name|</span>
<span class="sd"> +----+------+----+</span>
<span class="sd"> | 10| 80| A|</span>
<span class="sd"> | 5| NULL| B|</span>
<span class="sd"> |NULL| 10| Tom|</span>
<span class="sd"> |NULL| NULL|NULL|</span>
<span class="sd"> +----+------+----+</span>
<span class="sd"> Example 4: Replace 10 to 20 in the &#39;name&#39; column.</span>
<span class="sd"> &gt;&gt;&gt; df.na.replace(10, 18, &#39;age&#39;).show()</span>
<span class="sd"> +----+------+-----+</span>
<span class="sd"> | age|height| name|</span>
<span class="sd"> +----+------+-----+</span>
<span class="sd"> | 18| 80|Alice|</span>
<span class="sd"> | 5| NULL| Bob|</span>
<span class="sd"> |NULL| 10| Tom|</span>
<span class="sd"> |NULL| NULL| NULL|</span>
<span class="sd"> +----+------+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">approxQuantile</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">col</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">probabilities</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">approxQuantile</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">col</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span>
<span class="n">probabilities</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.approxQuantile"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.approxQuantile.html#pyspark.sql.DataFrame.approxQuantile">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">approxQuantile</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">col</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span>
<span class="n">probabilities</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Calculates the approximate quantiles of numerical columns of a</span>
<span class="sd"> :class:`DataFrame`.</span>
<span class="sd"> The result of this algorithm has the following deterministic bound:</span>
<span class="sd"> If the :class:`DataFrame` has N elements and if we request the quantile at</span>
<span class="sd"> probability `p` up to error `err`, then the algorithm will return</span>
<span class="sd"> a sample `x` from the :class:`DataFrame` so that the *exact* rank of `x` is</span>
<span class="sd"> close to (p * N). More precisely,</span>
<span class="sd"> floor((p - err) * N) &lt;= rank(x) &lt;= ceil((p + err) * N).</span>
<span class="sd"> This method implements a variation of the Greenwald-Khanna</span>
<span class="sd"> algorithm (with some speed optimizations). The algorithm was first</span>
<span class="sd"> present in [[https://doi.org/10.1145/375663.375670</span>
<span class="sd"> Space-efficient Online Computation of Quantile Summaries]]</span>
<span class="sd"> by Greenwald and Khanna.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col: str, tuple or list</span>
<span class="sd"> Can be a single column name, or a list of names for multiple columns.</span>
<span class="sd"> .. versionchanged:: 2.2.0</span>
<span class="sd"> Added support for multiple columns.</span>
<span class="sd"> probabilities : list or tuple of floats</span>
<span class="sd"> a list of quantile probabilities</span>
<span class="sd"> Each number must be a float in the range [0, 1].</span>
<span class="sd"> For example 0.0 is the minimum, 0.5 is the median, 1.0 is the maximum.</span>
<span class="sd"> relativeError : float</span>
<span class="sd"> The relative target precision to achieve</span>
<span class="sd"> (&gt;= 0). If set to zero, the exact quantiles are computed, which</span>
<span class="sd"> could be very expensive. Note that values greater than 1 are</span>
<span class="sd"> accepted but gives the same result as 1.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> list</span>
<span class="sd"> the approximate quantiles at the given probabilities.</span>
<span class="sd"> * If the input `col` is a string, the output is a list of floats.</span>
<span class="sd"> * If the input `col` is a list or tuple of strings, the output is also a</span>
<span class="sd"> list, but each element in it is a list of floats, i.e., the output</span>
<span class="sd"> is a list of list of floats.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Null values will be ignored in numerical columns before calculation.</span>
<span class="sd"> For columns only containing null values, an empty list is returned.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Calculating quantiles for a single column</span>
<span class="sd"> &gt;&gt;&gt; data = [(1,), (2,), (3,), (4,), (5,)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;values&quot;])</span>
<span class="sd"> &gt;&gt;&gt; quantiles = df.approxQuantile(&quot;values&quot;, [0.0, 0.5, 1.0], 0.05)</span>
<span class="sd"> &gt;&gt;&gt; quantiles</span>
<span class="sd"> [1.0, 3.0, 5.0]</span>
<span class="sd"> Example 2: Calculating quantiles for multiple columns</span>
<span class="sd"> &gt;&gt;&gt; data = [(1, 10), (2, 20), (3, 30), (4, 40), (5, 50)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;col1&quot;, &quot;col2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; quantiles = df.approxQuantile([&quot;col1&quot;, &quot;col2&quot;], [0.0, 0.5, 1.0], 0.05)</span>
<span class="sd"> &gt;&gt;&gt; quantiles</span>
<span class="sd"> [[1.0, 3.0, 5.0], [10.0, 30.0, 50.0]]</span>
<span class="sd"> Example 3: Handling null values</span>
<span class="sd"> &gt;&gt;&gt; data = [(1,), (None,), (3,), (4,), (None,)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;values&quot;])</span>
<span class="sd"> &gt;&gt;&gt; quantiles = df.approxQuantile(&quot;values&quot;, [0.0, 0.5, 1.0], 0.05)</span>
<span class="sd"> &gt;&gt;&gt; quantiles</span>
<span class="sd"> [1.0, 3.0, 4.0]</span>
<span class="sd"> Example 4: Calculating quantiles with low precision</span>
<span class="sd"> &gt;&gt;&gt; data = [(1,), (2,), (3,), (4,), (5,)]</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(data, [&quot;values&quot;])</span>
<span class="sd"> &gt;&gt;&gt; quantiles = df.approxQuantile(&quot;values&quot;, [0.0, 0.2, 1.0], 0.1)</span>
<span class="sd"> &gt;&gt;&gt; quantiles</span>
<span class="sd"> [1.0, 1.0, 5.0]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.corr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.corr.html#pyspark.sql.DataFrame.corr">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">corr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Calculates the correlation of two columns of a :class:`DataFrame` as a double value.</span>
<span class="sd"> Currently only supports the Pearson Correlation Coefficient.</span>
<span class="sd"> :func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases of each other.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : str</span>
<span class="sd"> The name of the first column</span>
<span class="sd"> col2 : str</span>
<span class="sd"> The name of the second column</span>
<span class="sd"> method : str, optional</span>
<span class="sd"> The correlation method. Currently only supports &quot;pearson&quot;</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> float</span>
<span class="sd"> Pearson Correlation Coefficient of two columns.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 12), (10, 1), (19, 8)], [&quot;c1&quot;, &quot;c2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.corr(&quot;c1&quot;, &quot;c2&quot;)</span>
<span class="sd"> -0.3592106040535498</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(11, 12), (10, 11), (9, 10)], [&quot;small&quot;, &quot;bigger&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.corr(&quot;small&quot;, &quot;bigger&quot;)</span>
<span class="sd"> 1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.cov"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.cov.html#pyspark.sql.DataFrame.cov">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">cov</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Calculate the sample covariance for the given columns, specified by their names, as a</span>
<span class="sd"> double value. :func:`DataFrame.cov` and :func:`DataFrameStatFunctions.cov` are aliases.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : str</span>
<span class="sd"> The name of the first column</span>
<span class="sd"> col2 : str</span>
<span class="sd"> The name of the second column</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> float</span>
<span class="sd"> Covariance of two columns.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 12), (10, 1), (19, 8)], [&quot;c1&quot;, &quot;c2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.cov(&quot;c1&quot;, &quot;c2&quot;)</span>
<span class="sd"> -18.0</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(11, 12), (10, 11), (9, 10)], [&quot;small&quot;, &quot;bigger&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.cov(&quot;small&quot;, &quot;bigger&quot;)</span>
<span class="sd"> 1.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.crosstab"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.crosstab.html#pyspark.sql.DataFrame.crosstab">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">crosstab</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Computes a pair-wise frequency table of the given columns. Also known as a contingency</span>
<span class="sd"> table.</span>
<span class="sd"> The first column of each row will be the distinct values of `col1` and the column names</span>
<span class="sd"> will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`.</span>
<span class="sd"> Pairs that have no occurrences will have zero as their counts.</span>
<span class="sd"> :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> col1 : str</span>
<span class="sd"> The name of the first column. Distinct items will make the first item of</span>
<span class="sd"> each row.</span>
<span class="sd"> col2 : str</span>
<span class="sd"> The name of the second column. Distinct items will make the column names</span>
<span class="sd"> of the :class:`DataFrame`.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Frequency matrix of two columns.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 11), (1, 11), (3, 10), (4, 8), (4, 8)], [&quot;c1&quot;, &quot;c2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.crosstab(&quot;c1&quot;, &quot;c2&quot;).sort(&quot;c1_c2&quot;).show()</span>
<span class="sd"> +-----+---+---+---+</span>
<span class="sd"> |c1_c2| 10| 11| 8|</span>
<span class="sd"> +-----+---+---+---+</span>
<span class="sd"> | 1| 0| 2| 0|</span>
<span class="sd"> | 3| 1| 0| 0|</span>
<span class="sd"> | 4| 0| 0| 2|</span>
<span class="sd"> +-----+---+---+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.freqItems"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.freqItems.html#pyspark.sql.DataFrame.freqItems">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">freqItems</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span> <span class="n">support</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Finding frequent items for columns, possibly with false positives. Using the</span>
<span class="sd"> frequent element count algorithm described in</span>
<span class="sd"> &quot;https://doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou&quot;.</span>
<span class="sd"> :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols : list or tuple</span>
<span class="sd"> Names of the columns to calculate frequent items for as a list or tuple of</span>
<span class="sd"> strings.</span>
<span class="sd"> support : float, optional</span>
<span class="sd"> The frequency with which to consider an item &#39;frequent&#39;. Default is 1%.</span>
<span class="sd"> The support must be greater than 1e-4.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> DataFrame with frequent items.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This function is meant for exploratory data analysis, as we make no</span>
<span class="sd"> guarantee about the backward compatibility of the schema of the resulting</span>
<span class="sd"> :class:`DataFrame`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 11), (1, 11), (3, 10), (4, 8), (4, 8)], [&quot;c1&quot;, &quot;c2&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.freqItems([&quot;c1&quot;, &quot;c2&quot;]).show() # doctest: +SKIP</span>
<span class="sd"> +------------+------------+</span>
<span class="sd"> |c1_freqItems|c2_freqItems|</span>
<span class="sd"> +------------+------------+</span>
<span class="sd"> | [4, 1, 3]| [8, 11, 10]|</span>
<span class="sd"> +------------+------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">_ipython_key_completions_</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the names of columns in this :class:`DataFrame`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df._ipython_key_completions_()</span>
<span class="sd"> [&#39;age&#39;, &#39;name&#39;]</span>
<span class="sd"> Would return illegal identifiers.</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], [&quot;age 1&quot;, &quot;name?1&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df._ipython_key_completions_()</span>
<span class="sd"> [&#39;age 1&#39;, &#39;name?1&#39;]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.withColumns"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumns.html#pyspark.sql.DataFrame.withColumns">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">withColumns</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">colsMap</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new :class:`DataFrame` by adding multiple columns or replacing the</span>
<span class="sd"> existing columns that have the same names.</span>
<span class="sd"> The colsMap is a map of column name and column, the column must only refer to attributes</span>
<span class="sd"> supplied by this Dataset. It is an error to add columns that refer to some other Dataset.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Added support for multiple columns adding</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> colsMap : dict</span>
<span class="sd"> a dict of column name and :class:`Column`. Currently, only a single map is supported.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> DataFrame with new or replaced columns.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.withColumns({&#39;age2&#39;: df.age + 2, &#39;age3&#39;: df.age + 3}).show()</span>
<span class="sd"> +---+-----+----+----+</span>
<span class="sd"> |age| name|age2|age3|</span>
<span class="sd"> +---+-----+----+----+</span>
<span class="sd"> | 2|Alice| 4| 5|</span>
<span class="sd"> | 5| Bob| 7| 8|</span>
<span class="sd"> +---+-----+----+----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.withColumn"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumn.html#pyspark.sql.DataFrame.withColumn">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">withColumn</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">colName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new :class:`DataFrame` by adding a column or replacing the</span>
<span class="sd"> existing column that has the same name.</span>
<span class="sd"> The column expression must be an expression over this :class:`DataFrame`; attempting to add</span>
<span class="sd"> a column from some other :class:`DataFrame` will raise an error.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> colName : str</span>
<span class="sd"> string, name of the new column.</span>
<span class="sd"> col : :class:`Column`</span>
<span class="sd"> a :class:`Column` expression for the new column.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> DataFrame with new or replaced column.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This method introduces a projection internally. Therefore, calling it multiple</span>
<span class="sd"> times, for instance, via loops in order to add multiple columns can generate big</span>
<span class="sd"> plans which can cause performance issues and even `StackOverflowException`.</span>
<span class="sd"> To avoid this, use :func:`select` with multiple columns at once.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.withColumn(&#39;age2&#39;, df.age + 2).show()</span>
<span class="sd"> +---+-----+----+</span>
<span class="sd"> |age| name|age2|</span>
<span class="sd"> +---+-----+----+</span>
<span class="sd"> | 2|Alice| 4|</span>
<span class="sd"> | 5| Bob| 7|</span>
<span class="sd"> +---+-----+----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.withColumnRenamed"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumnRenamed.html#pyspark.sql.DataFrame.withColumnRenamed">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">withColumnRenamed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">existing</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">new</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new :class:`DataFrame` by renaming an existing column.</span>
<span class="sd"> This is a no-op if the schema doesn&#39;t contain the given column name.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> existing : str</span>
<span class="sd"> The name of the existing column to be renamed.</span>
<span class="sd"> new : str</span>
<span class="sd"> The new name to be assigned to the column.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> A new DataFrame with renamed column.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> :meth:`withColumnsRenamed`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> Example 1: Rename a single column</span>
<span class="sd"> &gt;&gt;&gt; df.withColumnRenamed(&quot;age&quot;, &quot;age2&quot;).show()</span>
<span class="sd"> +----+-----+</span>
<span class="sd"> |age2| name|</span>
<span class="sd"> +----+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +----+-----+</span>
<span class="sd"> Example 2: Rename a column that does not exist (no-op)</span>
<span class="sd"> &gt;&gt;&gt; df.withColumnRenamed(&quot;non_existing&quot;, &quot;new_name&quot;).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Example 3: Rename multiple columns</span>
<span class="sd"> &gt;&gt;&gt; df.withColumnRenamed(&quot;age&quot;, &quot;age2&quot;).withColumnRenamed(&quot;name&quot;, &quot;name2&quot;).show()</span>
<span class="sd"> +----+-----+</span>
<span class="sd"> |age2|name2|</span>
<span class="sd"> +----+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +----+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.withColumnsRenamed"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumnsRenamed.html#pyspark.sql.DataFrame.withColumnsRenamed">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">withColumnsRenamed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">colsMap</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new :class:`DataFrame` by renaming multiple columns.</span>
<span class="sd"> This is a no-op if the schema doesn&#39;t contain the given column names.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Added support for multiple columns renaming</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> colsMap : dict</span>
<span class="sd"> A dict of existing column names and corresponding desired column names.</span>
<span class="sd"> Currently, only a single map is supported.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> DataFrame with renamed columns.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> :meth:`withColumnRenamed`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> Example 1: Rename a single column</span>
<span class="sd"> &gt;&gt;&gt; df.withColumnsRenamed({&quot;age&quot;: &quot;age2&quot;}).show()</span>
<span class="sd"> +----+-----+</span>
<span class="sd"> |age2| name|</span>
<span class="sd"> +----+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +----+-----+</span>
<span class="sd"> Example 2: Rename multiple columns</span>
<span class="sd"> &gt;&gt;&gt; df.withColumnsRenamed({&quot;age&quot;: &quot;age2&quot;, &quot;name&quot;: &quot;name2&quot;}).show()</span>
<span class="sd"> +----+-----+</span>
<span class="sd"> |age2|name2|</span>
<span class="sd"> +----+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +----+-----+</span>
<span class="sd"> Example 3: Rename non-existing column (no-op)</span>
<span class="sd"> &gt;&gt;&gt; df.withColumnsRenamed({&quot;non_existing&quot;: &quot;new_name&quot;}).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Example 4: Rename with an empty dictionary (no-op)</span>
<span class="sd"> &gt;&gt;&gt; df.withColumnsRenamed({}).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 2|Alice|</span>
<span class="sd"> | 5| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.withMetadata"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.withMetadata.html#pyspark.sql.DataFrame.withMetadata">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">withMetadata</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">columnName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">metadata</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`DataFrame` by updating an existing column with metadata.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> columnName : str</span>
<span class="sd"> string, name of the existing column to update the metadata.</span>
<span class="sd"> metadata : dict</span>
<span class="sd"> dict, new metadata to be assigned to df.schema[columnName].metadata</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> DataFrame with updated metadata column.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(2, &quot;Alice&quot;), (5, &quot;Bob&quot;)], schema=[&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df_meta = df.withMetadata(&#39;age&#39;, {&#39;foo&#39;: &#39;bar&#39;})</span>
<span class="sd"> &gt;&gt;&gt; df_meta.schema[&#39;age&#39;].metadata</span>
<span class="sd"> {&#39;foo&#39;: &#39;bar&#39;}</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">drop</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">drop</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.drop"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.drop.html#pyspark.sql.DataFrame.drop">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span>
<span class="k">def</span> <span class="nf">drop</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new :class:`DataFrame` without specified columns.</span>
<span class="sd"> This is a no-op if the schema doesn&#39;t contain the given column name(s).</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> cols: str or :class:`Column`</span>
<span class="sd"> A name of the column, or the :class:`Column` to be dropped.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> A new :class:`DataFrame` without the specified columns.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> - When an input is a column name, it is treated literally without further interpretation.</span>
<span class="sd"> Otherwise, it will try to match the equivalent expression.</span>
<span class="sd"> So dropping a column by its name `drop(colName)` has a different semantic</span>
<span class="sd"> with directly dropping the column `drop(col(colName))`.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Example 1: Drop a column by name.</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.drop(&#39;age&#39;).show()</span>
<span class="sd"> +-----+</span>
<span class="sd"> | name|</span>
<span class="sd"> +-----+</span>
<span class="sd"> | Tom|</span>
<span class="sd"> |Alice|</span>
<span class="sd"> | Bob|</span>
<span class="sd"> +-----+</span>
<span class="sd"> Example 2: Drop a column by :class:`Column` object.</span>
<span class="sd"> &gt;&gt;&gt; df.drop(df.age).show()</span>
<span class="sd"> +-----+</span>
<span class="sd"> | name|</span>
<span class="sd"> +-----+</span>
<span class="sd"> | Tom|</span>
<span class="sd"> |Alice|</span>
<span class="sd"> | Bob|</span>
<span class="sd"> +-----+</span>
<span class="sd"> Example 3: Drop the column that joined both DataFrames on.</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.createDataFrame([(80, &quot;Tom&quot;), (85, &quot;Bob&quot;)], [&quot;height&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.join(df2, df.name == df2.name).drop(&#39;name&#39;).sort(&#39;age&#39;).show()</span>
<span class="sd"> +---+------+</span>
<span class="sd"> |age|height|</span>
<span class="sd"> +---+------+</span>
<span class="sd"> | 14| 80|</span>
<span class="sd"> | 16| 85|</span>
<span class="sd"> +---+------+</span>
<span class="sd"> &gt;&gt;&gt; df3 = df.join(df2)</span>
<span class="sd"> &gt;&gt;&gt; df3.show()</span>
<span class="sd"> +---+-----+------+----+</span>
<span class="sd"> |age| name|height|name|</span>
<span class="sd"> +---+-----+------+----+</span>
<span class="sd"> | 14| Tom| 80| Tom|</span>
<span class="sd"> | 14| Tom| 85| Bob|</span>
<span class="sd"> | 23|Alice| 80| Tom|</span>
<span class="sd"> | 23|Alice| 85| Bob|</span>
<span class="sd"> | 16| Bob| 80| Tom|</span>
<span class="sd"> | 16| Bob| 85| Bob|</span>
<span class="sd"> +---+-----+------+----+</span>
<span class="sd"> Example 4: Drop two column by the same name.</span>
<span class="sd"> &gt;&gt;&gt; df3.drop(&quot;name&quot;).show()</span>
<span class="sd"> +---+------+</span>
<span class="sd"> |age|height|</span>
<span class="sd"> +---+------+</span>
<span class="sd"> | 14| 80|</span>
<span class="sd"> | 14| 85|</span>
<span class="sd"> | 23| 80|</span>
<span class="sd"> | 23| 85|</span>
<span class="sd"> | 16| 80|</span>
<span class="sd"> | 16| 85|</span>
<span class="sd"> +---+------+</span>
<span class="sd"> Example 5: Can not drop col(&#39;name&#39;) due to ambiguous reference.</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; df3.drop(sf.col(&quot;name&quot;)).show()</span>
<span class="sd"> Traceback (most recent call last):</span>
<span class="sd"> ...</span>
<span class="sd"> pyspark.errors.exceptions.captured.AnalysisException: [AMBIGUOUS_REFERENCE] Reference...</span>
<span class="sd"> Example 6: Can not find a column matching the expression &quot;a.b.c&quot;.</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import functions as sf</span>
<span class="sd"> &gt;&gt;&gt; df4 = df.withColumn(&quot;a.b.c&quot;, sf.lit(1))</span>
<span class="sd"> &gt;&gt;&gt; df4.show()</span>
<span class="sd"> +---+-----+-----+</span>
<span class="sd"> |age| name|a.b.c|</span>
<span class="sd"> +---+-----+-----+</span>
<span class="sd"> | 14| Tom| 1|</span>
<span class="sd"> | 23|Alice| 1|</span>
<span class="sd"> | 16| Bob| 1|</span>
<span class="sd"> +---+-----+-----+</span>
<span class="sd"> &gt;&gt;&gt; df4.drop(&quot;a.b.c&quot;).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 14| Tom|</span>
<span class="sd"> | 23|Alice|</span>
<span class="sd"> | 16| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &gt;&gt;&gt; df4.drop(sf.col(&quot;a.b.c&quot;)).show()</span>
<span class="sd"> +---+-----+-----+</span>
<span class="sd"> |age| name|a.b.c|</span>
<span class="sd"> +---+-----+-----+</span>
<span class="sd"> | 14| Tom| 1|</span>
<span class="sd"> | 23|Alice| 1|</span>
<span class="sd"> | 16| Bob| 1|</span>
<span class="sd"> +---+-----+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.toDF"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.toDF.html#pyspark.sql.DataFrame.toDF">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">toDF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`DataFrame` that with new specified column names</span>
<span class="sd"> .. versionadded:: 1.6.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> *cols : tuple</span>
<span class="sd"> a tuple of string new column name. The length of the</span>
<span class="sd"> list needs to be the same as the number of columns in the initial</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> DataFrame with new column names.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(14, &quot;Tom&quot;), (23, &quot;Alice&quot;),</span>
<span class="sd"> ... (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.toDF(&#39;f1&#39;, &#39;f2&#39;).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | f1| f2|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 14| Tom|</span>
<span class="sd"> | 23|Alice|</span>
<span class="sd"> | 16| Bob|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.transform"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.transform.html#pyspark.sql.DataFrame.transform">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">],</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> func : function</span>
<span class="sd"> a function that takes and returns a :class:`DataFrame`.</span>
<span class="sd"> *args</span>
<span class="sd"> Positional arguments to pass to func.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> **kwargs</span>
<span class="sd"> Keyword arguments to pass to func.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Transformed DataFrame.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import col</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 1.0), (2, 2.0)], [&quot;int&quot;, &quot;float&quot;])</span>
<span class="sd"> &gt;&gt;&gt; def cast_all_to_int(input_df):</span>
<span class="sd"> ... return input_df.select([col(col_name).cast(&quot;int&quot;) for col_name in input_df.columns])</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; def sort_columns_asc(input_df):</span>
<span class="sd"> ... return input_df.select(*sorted(input_df.columns))</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df.transform(cast_all_to_int).transform(sort_columns_asc).show()</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |float|int|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | 1| 1|</span>
<span class="sd"> | 2| 2|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> &gt;&gt;&gt; def add_n(input_df, n):</span>
<span class="sd"> ... return input_df.select([(col(col_name) + n).alias(col_name)</span>
<span class="sd"> ... for col_name in input_df.columns])</span>
<span class="sd"> &gt;&gt;&gt; df.transform(add_n, 1).transform(add_n, n=10).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |int|float|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 12| 12.0|</span>
<span class="sd"> | 13| 13.0|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.sameSemantics"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.sameSemantics.html#pyspark.sql.DataFrame.sameSemantics">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">sameSemantics</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">bool</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns `True` when the logical query plans inside both :class:`DataFrame`\\s are equal and</span>
<span class="sd"> therefore return the same results.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.5.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> The equality comparison here is simplified by tolerating the cosmetic differences</span>
<span class="sd"> such as attribute names.</span>
<span class="sd"> This API can compare both :class:`DataFrame`\\s very fast but can still return</span>
<span class="sd"> `False` on the :class:`DataFrame` that return the same results, for instance, from</span>
<span class="sd"> different plans. Such false negative semantic can be useful when caching as an example.</span>
<span class="sd"> This API is a developer API.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> other : :class:`DataFrame`</span>
<span class="sd"> The other DataFrame to compare against.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> bool</span>
<span class="sd"> Whether these two DataFrames are similar.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df1 = spark.range(10)</span>
<span class="sd"> &gt;&gt;&gt; df2 = spark.range(10)</span>
<span class="sd"> &gt;&gt;&gt; df1.withColumn(&quot;col1&quot;, df1.id * 2).sameSemantics(df2.withColumn(&quot;col1&quot;, df2.id * 2))</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; df1.withColumn(&quot;col1&quot;, df1.id * 2).sameSemantics(df2.withColumn(&quot;col1&quot;, df2.id + 2))</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; df1.withColumn(&quot;col1&quot;, df1.id * 2).sameSemantics(df2.withColumn(&quot;col0&quot;, df2.id * 2))</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.semanticHash"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.semanticHash.html#pyspark.sql.DataFrame.semanticHash">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">semanticHash</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">int</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a hash code of the logical query plan against this :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.5.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Unlike the standard hash code, the hash is calculated against the query plan</span>
<span class="sd"> simplified by tolerating the cosmetic differences such as attribute names.</span>
<span class="sd"> This API is a developer API.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> int</span>
<span class="sd"> Hash value.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.range(10).selectExpr(&quot;id as col0&quot;).semanticHash() # doctest: +SKIP</span>
<span class="sd"> 1855039936</span>
<span class="sd"> &gt;&gt;&gt; spark.range(10).selectExpr(&quot;id as col1&quot;).semanticHash() # doctest: +SKIP</span>
<span class="sd"> 1855039936</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.inputFiles"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.inputFiles.html#pyspark.sql.DataFrame.inputFiles">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">inputFiles</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a best-effort snapshot of the files that compose this :class:`DataFrame`.</span>
<span class="sd"> This method simply asks each constituent BaseRelation for its respective files and</span>
<span class="sd"> takes the union of all results. Depending on the source relations, this may not find</span>
<span class="sd"> all input files. Duplicates are removed.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> list</span>
<span class="sd"> List of file paths.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import tempfile</span>
<span class="sd"> &gt;&gt;&gt; with tempfile.TemporaryDirectory(prefix=&quot;inputFiles&quot;) as d:</span>
<span class="sd"> ... # Write a single-row DataFrame into a JSON file</span>
<span class="sd"> ... spark.createDataFrame(</span>
<span class="sd"> ... [{&quot;age&quot;: 100, &quot;name&quot;: &quot;Hyukjin Kwon&quot;}]</span>
<span class="sd"> ... ).repartition(1).write.json(d, mode=&quot;overwrite&quot;)</span>
<span class="sd"> ...</span>
<span class="sd"> ... # Read the JSON file as a DataFrame.</span>
<span class="sd"> ... df = spark.read.format(&quot;json&quot;).load(d)</span>
<span class="sd"> ...</span>
<span class="sd"> ... # Returns the number of input files.</span>
<span class="sd"> ... len(df.inputFiles())</span>
<span class="sd"> 1</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.where"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.where.html#pyspark.sql.DataFrame.where">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">where</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">condition</span><span class="p">:</span> <span class="s2">&quot;ColumnOrName&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> :func:`where` is an alias for :func:`filter`.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<span class="c1"># Two aliases below were added for pandas compatibility many years ago.</span>
<span class="c1"># There are too many differences compared to pandas and we cannot just</span>
<span class="c1"># make it &quot;compatible&quot; by adding aliases. Therefore, we stop adding such</span>
<span class="c1"># aliases as of Spark 3.0. Two methods below remain just</span>
<span class="c1"># for legacy users currently.</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">groupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrNameOrOrdinal&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">groupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]])</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span>
<span class="k">def</span> <span class="nf">groupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">&quot;ColumnOrNameOrOrdinal&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;GroupedData&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> :func:`groupby` is an alias for :func:`groupBy`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrame.drop_duplicates"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.drop_duplicates.html#pyspark.sql.DataFrame.drop_duplicates">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">drop_duplicates</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.writeTo"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.writeTo.html#pyspark.sql.DataFrame.writeTo">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">writeTo</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">table</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrameWriterV2</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create a write configuration builder for v2 sources.</span>
<span class="sd"> This builder is used to configure and execute write operations.</span>
<span class="sd"> For example, to append or create or replace existing tables.</span>
<span class="sd"> .. versionadded:: 3.1.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> table : str</span>
<span class="sd"> Target table name to write to.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrameWriterV2`</span>
<span class="sd"> DataFrameWriterV2 to use further to specify how to save the data</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.writeTo(&quot;catalog.db.table&quot;).append() # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; df.writeTo( # doctest: +SKIP</span>
<span class="sd"> ... &quot;catalog.db.table&quot;</span>
<span class="sd"> ... ).partitionedBy(&quot;col&quot;).createOrReplace()</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.pandas_api"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.pandas_api.html#pyspark.sql.DataFrame.pandas_api">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">pandas_api</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PandasOnSparkDataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts the existing DataFrame into a pandas-on-Spark DataFrame.</span>
<span class="sd"> .. versionadded:: 3.2.0</span>
<span class="sd"> .. versionchanged:: 3.5.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> If a pandas-on-Spark DataFrame is converted to a Spark DataFrame and then back</span>
<span class="sd"> to pandas-on-Spark, it will lose the index information and the original index</span>
<span class="sd"> will be turned into a normal column.</span>
<span class="sd"> This is only available if Pandas is installed and available.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> index_col: str or list of str, optional</span>
<span class="sd"> Index column of table in Spark.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`PandasOnSparkDataFrame`</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.pandas.frame.DataFrame.to_spark</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame(</span>
<span class="sd"> ... [(14, &quot;Tom&quot;), (23, &quot;Alice&quot;), (16, &quot;Bob&quot;)], [&quot;age&quot;, &quot;name&quot;])</span>
<span class="sd"> &gt;&gt;&gt; df.pandas_api() # doctest: +SKIP</span>
<span class="sd"> age name</span>
<span class="sd"> 0 14 Tom</span>
<span class="sd"> 1 23 Alice</span>
<span class="sd"> 2 16 Bob</span>
<span class="sd"> We can specify the index columns.</span>
<span class="sd"> &gt;&gt;&gt; df.pandas_api(index_col=&quot;age&quot;) # doctest: +SKIP</span>
<span class="sd"> name</span>
<span class="sd"> age</span>
<span class="sd"> 14 Tom</span>
<span class="sd"> 23 Alice</span>
<span class="sd"> 16 Bob</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.mapInPandas"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.mapInPandas.html#pyspark.sql.DataFrame.mapInPandas">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">mapInPandas</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">func</span><span class="p">:</span> <span class="s2">&quot;PandasMapIterFunction&quot;</span><span class="p">,</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span>
<span class="n">barrier</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">profile</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">ResourceProfile</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Maps an iterator of batches in the current :class:`DataFrame` using a Python native</span>
<span class="sd"> function that is performed on pandas DataFrames both as input and output,</span>
<span class="sd"> and returns the result as a :class:`DataFrame`.</span>
<span class="sd"> This method applies the specified Python function to an iterator of</span>
<span class="sd"> `pandas.DataFrame`\\s, each representing a batch of rows from the original DataFrame.</span>
<span class="sd"> The returned iterator of `pandas.DataFrame`\\s are combined as a :class:`DataFrame`.</span>
<span class="sd"> The size of the function&#39;s input and output can be different. Each `pandas.DataFrame`</span>
<span class="sd"> size can be controlled by `spark.sql.execution.arrow.maxRecordsPerBatch`.</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> func : function</span>
<span class="sd"> a Python native function that takes an iterator of `pandas.DataFrame`\\s, and</span>
<span class="sd"> outputs an iterator of `pandas.DataFrame`\\s.</span>
<span class="sd"> schema : :class:`pyspark.sql.types.DataType` or str</span>
<span class="sd"> the return type of the `func` in PySpark. The value can be either a</span>
<span class="sd"> :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.</span>
<span class="sd"> barrier : bool, optional, default False</span>
<span class="sd"> Use barrier mode execution, ensuring that all Python workers in the stage will be</span>
<span class="sd"> launched concurrently.</span>
<span class="sd"> .. versionadded: 3.5.0</span>
<span class="sd"> profile : :class:`pyspark.resource.ResourceProfile`. The optional ResourceProfile</span>
<span class="sd"> to be used for mapInPandas.</span>
<span class="sd"> .. versionadded: 4.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 21), (2, 30)], (&quot;id&quot;, &quot;age&quot;))</span>
<span class="sd"> Filter rows with id equal to 1:</span>
<span class="sd"> &gt;&gt;&gt; def filter_func(iterator):</span>
<span class="sd"> ... for pdf in iterator:</span>
<span class="sd"> ... yield pdf[pdf.id == 1]</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df.mapInPandas(filter_func, df.schema).show() # doctest: +SKIP</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | id|age|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 21|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> Compute the mean age for each id:</span>
<span class="sd"> &gt;&gt;&gt; def mean_age(iterator):</span>
<span class="sd"> ... for pdf in iterator:</span>
<span class="sd"> ... yield pdf.groupby(&quot;id&quot;).mean().reset_index()</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df.mapInPandas(mean_age, &quot;id: bigint, age: double&quot;).show() # doctest: +SKIP</span>
<span class="sd"> +---+----+</span>
<span class="sd"> | id| age|</span>
<span class="sd"> +---+----+</span>
<span class="sd"> | 1|21.0|</span>
<span class="sd"> | 2|30.0|</span>
<span class="sd"> +---+----+</span>
<span class="sd"> Add a new column with the double of the age:</span>
<span class="sd"> &gt;&gt;&gt; def double_age(iterator):</span>
<span class="sd"> ... for pdf in iterator:</span>
<span class="sd"> ... pdf[&quot;double_age&quot;] = pdf[&quot;age&quot;] * 2</span>
<span class="sd"> ... yield pdf</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; df.mapInPandas(</span>
<span class="sd"> ... double_age, &quot;id: bigint, age: bigint, double_age: bigint&quot;).show() # doctest: +SKIP</span>
<span class="sd"> +---+---+----------+</span>
<span class="sd"> | id|age|double_age|</span>
<span class="sd"> +---+---+----------+</span>
<span class="sd"> | 1| 21| 42|</span>
<span class="sd"> | 2| 30| 60|</span>
<span class="sd"> +---+---+----------+</span>
<span class="sd"> Set ``barrier`` to ``True`` to force the ``mapInPandas`` stage running in the</span>
<span class="sd"> barrier mode, it ensures all Python workers in the stage will be</span>
<span class="sd"> launched concurrently.</span>
<span class="sd"> &gt;&gt;&gt; df.mapInPandas(filter_func, df.schema, barrier=True).show() # doctest: +SKIP</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | id|age|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 21|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This API is experimental</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.sql.functions.pandas_udf</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.mapInArrow"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.mapInArrow.html#pyspark.sql.DataFrame.mapInArrow">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">mapInArrow</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">func</span><span class="p">:</span> <span class="s2">&quot;ArrowMapIterFunction&quot;</span><span class="p">,</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span>
<span class="n">barrier</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span>
<span class="n">profile</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">ResourceProfile</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Maps an iterator of batches in the current :class:`DataFrame` using a Python native</span>
<span class="sd"> function that is performed on `pyarrow.RecordBatch`\\s both as input and output,</span>
<span class="sd"> and returns the result as a :class:`DataFrame`.</span>
<span class="sd"> This method applies the specified Python function to an iterator of</span>
<span class="sd"> `pyarrow.RecordBatch`\\s, each representing a batch of rows from the original DataFrame.</span>
<span class="sd"> The returned iterator of `pyarrow.RecordBatch`\\s are combined as a :class:`DataFrame`.</span>
<span class="sd"> The size of the function&#39;s input and output can be different. Each `pyarrow.RecordBatch`</span>
<span class="sd"> size can be controlled by `spark.sql.execution.arrow.maxRecordsPerBatch`.</span>
<span class="sd"> .. versionadded:: 3.3.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> func : function</span>
<span class="sd"> a Python native function that takes an iterator of `pyarrow.RecordBatch`\\s, and</span>
<span class="sd"> outputs an iterator of `pyarrow.RecordBatch`\\s.</span>
<span class="sd"> schema : :class:`pyspark.sql.types.DataType` or str</span>
<span class="sd"> the return type of the `func` in PySpark. The value can be either a</span>
<span class="sd"> :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.</span>
<span class="sd"> barrier : bool, optional, default False</span>
<span class="sd"> Use barrier mode execution, ensuring that all Python workers in the stage will be</span>
<span class="sd"> launched concurrently.</span>
<span class="sd"> .. versionadded: 3.5.0</span>
<span class="sd"> profile : :class:`pyspark.resource.ResourceProfile`. The optional ResourceProfile</span>
<span class="sd"> to be used for mapInArrow.</span>
<span class="sd"> .. versionadded: 4.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; import pyarrow # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([(1, 21), (2, 30)], (&quot;id&quot;, &quot;age&quot;))</span>
<span class="sd"> &gt;&gt;&gt; def filter_func(iterator):</span>
<span class="sd"> ... for batch in iterator:</span>
<span class="sd"> ... pdf = batch.to_pandas()</span>
<span class="sd"> ... yield pyarrow.RecordBatch.from_pandas(pdf[pdf.id == 1])</span>
<span class="sd"> &gt;&gt;&gt; df.mapInArrow(filter_func, df.schema).show() # doctest: +SKIP</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | id|age|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 21|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> Set ``barrier`` to ``True`` to force the ``mapInArrow`` stage running in the</span>
<span class="sd"> barrier mode, it ensures all Python workers in the stage will be</span>
<span class="sd"> launched concurrently.</span>
<span class="sd"> &gt;&gt;&gt; df.mapInArrow(filter_func, df.schema, barrier=True).show() # doctest: +SKIP</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | id|age|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 21|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This API is unstable, and for developers.</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> pyspark.sql.functions.pandas_udf</span>
<span class="sd"> pyspark.sql.DataFrame.mapInPandas</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.toArrow"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.toArrow.html#pyspark.sql.DataFrame.toArrow">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">toArrow</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;pa.Table&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.</span>
<span class="sd"> This is only available if PyArrow is installed and available.</span>
<span class="sd"> .. versionadded:: 4.0.0</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This method should only be used if the resulting PyArrow ``pyarrow.Table`` is</span>
<span class="sd"> expected to be small, as all the data is loaded into the driver&#39;s memory.</span>
<span class="sd"> This API is a developer API.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.toArrow() # doctest: +SKIP</span>
<span class="sd"> pyarrow.Table</span>
<span class="sd"> age: int64</span>
<span class="sd"> name: string</span>
<span class="sd"> ----</span>
<span class="sd"> age: [[2,5]]</span>
<span class="sd"> name: [[&quot;Alice&quot;,&quot;Bob&quot;]]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div>
<div class="viewcode-block" id="DataFrame.toPandas"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.toPandas.html#pyspark.sql.DataFrame.toPandas">[docs]</a> <span class="k">def</span> <span class="nf">toPandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;PandasDataFrameLike&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``.</span>
<span class="sd"> This is only available if Pandas is installed and available.</span>
<span class="sd"> .. versionadded:: 1.3.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This method should only be used if the resulting Pandas ``pandas.DataFrame`` is</span>
<span class="sd"> expected to be small, as all the data is loaded into the driver&#39;s memory.</span>
<span class="sd"> Usage with ``spark.sql.execution.arrow.pyspark.enabled=True`` is experimental.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; df.toPandas() # doctest: +SKIP</span>
<span class="sd"> age name</span>
<span class="sd"> 0 2 Alice</span>
<span class="sd"> 1 5 Bob</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="o">...</span></div></div>
<div class="viewcode-block" id="DataFrameNaFunctions"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameNaFunctions.html#pyspark.sql.DataFrameNaFunctions">[docs]</a><span class="k">class</span> <span class="nc">DataFrameNaFunctions</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Functionality for working with missing data in :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">df</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">df</span> <span class="o">=</span> <span class="n">df</span>
<div class="viewcode-block" id="DataFrameNaFunctions.drop"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameNaFunctions.drop.html#pyspark.sql.DataFrameNaFunctions.drop">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">drop</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">how</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">&quot;any&quot;</span><span class="p">,</span>
<span class="n">thresh</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="o">...</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span></div>
<span class="n">drop</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">dropna</span><span class="o">.</span><span class="vm">__doc__</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">fill</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="s2">&quot;LiteralType&quot;</span><span class="p">,</span> <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">fill</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="s2">&quot;LiteralType&quot;</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrameNaFunctions.fill"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameNaFunctions.fill.html#pyspark.sql.DataFrameNaFunctions.fill">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">fill</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">value</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;LiteralType&quot;</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="s2">&quot;LiteralType&quot;</span><span class="p">]],</span>
<span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span></div>
<span class="n">fill</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">fillna</span><span class="o">.</span><span class="vm">__doc__</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">replace</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">to_replace</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;LiteralType&quot;</span><span class="p">],</span>
<span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">],</span>
<span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">replace</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">to_replace</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="s2">&quot;LiteralType&quot;</span><span class="p">,</span> <span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">],</span>
<span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">replace</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">to_replace</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;LiteralType&quot;</span><span class="p">],</span>
<span class="n">value</span><span class="p">:</span> <span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">,</span>
<span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrameNaFunctions.replace"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameNaFunctions.replace.html#pyspark.sql.DataFrameNaFunctions.replace">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span>
<span class="k">def</span> <span class="nf">replace</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">to_replace</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">&quot;LiteralType&quot;</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="s2">&quot;LiteralType&quot;</span><span class="p">,</span> <span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">]],</span>
<span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span>
<span class="n">Union</span><span class="p">[</span><span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">],</span> <span class="n">_NoValueType</span><span class="p">]</span>
<span class="p">]</span> <span class="o">=</span> <span class="n">_NoValue</span><span class="p">,</span>
<span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span></div>
<span class="n">replace</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">replace</span><span class="o">.</span><span class="vm">__doc__</span></div>
<div class="viewcode-block" id="DataFrameStatFunctions"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.html#pyspark.sql.DataFrameStatFunctions">[docs]</a><span class="k">class</span> <span class="nc">DataFrameStatFunctions</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Functionality for statistic functions with :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 1.4.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">df</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">df</span> <span class="o">=</span> <span class="n">df</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">approxQuantile</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">col</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span>
<span class="n">probabilities</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">approxQuantile</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">col</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span>
<span class="n">probabilities</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]:</span>
<span class="o">...</span>
<div class="viewcode-block" id="DataFrameStatFunctions.approxQuantile"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.approxQuantile.html#pyspark.sql.DataFrameStatFunctions.approxQuantile">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">approxQuantile</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">col</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span>
<span class="n">probabilities</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span>
<span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]:</span>
<span class="o">...</span></div>
<span class="n">approxQuantile</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">approxQuantile</span><span class="o">.</span><span class="vm">__doc__</span>
<div class="viewcode-block" id="DataFrameStatFunctions.corr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.corr.html#pyspark.sql.DataFrameStatFunctions.corr">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">corr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="o">...</span></div>
<span class="n">corr</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">corr</span><span class="o">.</span><span class="vm">__doc__</span>
<div class="viewcode-block" id="DataFrameStatFunctions.cov"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.cov.html#pyspark.sql.DataFrameStatFunctions.cov">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">cov</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">float</span><span class="p">:</span>
<span class="o">...</span></div>
<span class="n">cov</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">cov</span><span class="o">.</span><span class="vm">__doc__</span>
<div class="viewcode-block" id="DataFrameStatFunctions.crosstab"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.crosstab.html#pyspark.sql.DataFrameStatFunctions.crosstab">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">crosstab</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span></div>
<span class="n">crosstab</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">crosstab</span><span class="o">.</span><span class="vm">__doc__</span>
<div class="viewcode-block" id="DataFrameStatFunctions.freqItems"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.freqItems.html#pyspark.sql.DataFrameStatFunctions.freqItems">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">freqItems</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">cols</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">support</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span></div>
<span class="n">freqItems</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">freqItems</span><span class="o">.</span><span class="vm">__doc__</span>
<div class="viewcode-block" id="DataFrameStatFunctions.sampleBy"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.sampleBy.html#pyspark.sql.DataFrameStatFunctions.sampleBy">[docs]</a> <span class="nd">@dispatch_df_method</span>
<span class="k">def</span> <span class="nf">sampleBy</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">col</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">fractions</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="nb">float</span><span class="p">],</span> <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span></div>
<span class="n">sampleBy</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">sampleBy</span><span class="o">.</span><span class="vm">__doc__</span></div>
</pre></div>
</article>
<footer class="bd-footer-article">
<div class="footer-article-items footer-article__inner">
<div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
</div></div>
</div>
</footer>
</div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item"><p class="copyright">
Copyright @ 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
</p></div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item"><p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3.
</p></div>
</div>
</div>
</footer>
</body>
</html>