Source code for pyspark.sql.session

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">sys</span>
<span class="kn">import</span> <span class="nn">warnings</span>
<span class="kn">from</span> <span class="nn">collections.abc</span> <span class="kn">import</span> <span class="n">Sized</span>
<span class="kn">from</span> <span class="nn">functools</span> <span class="kn">import</span> <span class="n">reduce</span>
<span class="kn">from</span> <span class="nn">threading</span> <span class="kn">import</span> <span class="n">RLock</span>
<span class="kn">from</span> <span class="nn">types</span> <span class="kn">import</span> <span class="n">TracebackType</span>
<span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">Any</span><span class="p">,</span>
<span class="n">ClassVar</span><span class="p">,</span>
<span class="n">Dict</span><span class="p">,</span>
<span class="n">Iterable</span><span class="p">,</span>
<span class="n">List</span><span class="p">,</span>
<span class="n">Optional</span><span class="p">,</span>
<span class="n">Tuple</span><span class="p">,</span>
<span class="n">Type</span><span class="p">,</span>
<span class="n">Union</span><span class="p">,</span>
<span class="n">Set</span><span class="p">,</span>
<span class="n">cast</span><span class="p">,</span>
<span class="n">no_type_check</span><span class="p">,</span>
<span class="n">overload</span><span class="p">,</span>
<span class="n">TYPE_CHECKING</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.conf</span> <span class="kn">import</span> <span class="n">SparkConf</span>
<span class="kn">from</span> <span class="nn">pyspark.util</span> <span class="kn">import</span> <span class="n">is_remote_only</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.conf</span> <span class="kn">import</span> <span class="n">RuntimeConfig</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.dataframe</span> <span class="kn">import</span> <span class="n">DataFrame</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">lit</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.pandas.conversion</span> <span class="kn">import</span> <span class="n">SparkConversionMixin</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.profiler</span> <span class="kn">import</span> <span class="n">AccumulatorProfilerCollector</span><span class="p">,</span> <span class="n">Profile</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.readwriter</span> <span class="kn">import</span> <span class="n">DataFrameReader</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.sql_formatter</span> <span class="kn">import</span> <span class="n">SQLStringFormatter</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.streaming</span> <span class="kn">import</span> <span class="n">DataStreamReader</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="p">(</span>
<span class="n">AtomicType</span><span class="p">,</span>
<span class="n">DataType</span><span class="p">,</span>
<span class="n">StructField</span><span class="p">,</span>
<span class="n">StructType</span><span class="p">,</span>
<span class="n">_make_type_verifier</span><span class="p">,</span>
<span class="n">_infer_schema</span><span class="p">,</span>
<span class="n">_has_nulltype</span><span class="p">,</span>
<span class="n">_merge_type</span><span class="p">,</span>
<span class="n">_create_converter</span><span class="p">,</span>
<span class="n">_parse_datatype_string</span><span class="p">,</span>
<span class="n">_from_numpy_type</span><span class="p">,</span>
<span class="p">)</span>
<span class="kn">from</span> <span class="nn">pyspark.errors.exceptions.captured</span> <span class="kn">import</span> <span class="n">install_exception_handler</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.utils</span> <span class="kn">import</span> <span class="n">is_timestamp_ntz_preferred</span><span class="p">,</span> <span class="n">to_str</span><span class="p">,</span> <span class="n">try_remote_session_classmethod</span>
<span class="kn">from</span> <span class="nn">pyspark.errors</span> <span class="kn">import</span> <span class="n">PySparkValueError</span><span class="p">,</span> <span class="n">PySparkTypeError</span><span class="p">,</span> <span class="n">PySparkRuntimeError</span>
<span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">py4j.java_gateway</span> <span class="kn">import</span> <span class="n">JavaObject</span>
<span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="kn">from</span> <span class="nn">pyspark.core.context</span> <span class="kn">import</span> <span class="n">SparkContext</span>
<span class="kn">from</span> <span class="nn">pyspark.core.rdd</span> <span class="kn">import</span> <span class="n">RDD</span>
<span class="kn">from</span> <span class="nn">pyspark.sql._typing</span> <span class="kn">import</span> <span class="n">AtomicValue</span><span class="p">,</span> <span class="n">RowLike</span><span class="p">,</span> <span class="n">OptionalPrimitiveType</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.catalog</span> <span class="kn">import</span> <span class="n">Catalog</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.pandas._typing</span> <span class="kn">import</span> <span class="n">ArrayLike</span><span class="p">,</span> <span class="n">DataFrameLike</span> <span class="k">as</span> <span class="n">PandasDataFrameLike</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.streaming</span> <span class="kn">import</span> <span class="n">StreamingQueryManager</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.udf</span> <span class="kn">import</span> <span class="n">UDFRegistration</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.udtf</span> <span class="kn">import</span> <span class="n">UDTFRegistration</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.datasource</span> <span class="kn">import</span> <span class="n">DataSourceRegistration</span>
<span class="c1"># Running MyPy type checks will always require pandas and</span>
<span class="c1"># other dependencies so importing here is fine.</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.connect.client</span> <span class="kn">import</span> <span class="n">SparkConnectClient</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.connect.shell.progress</span> <span class="kn">import</span> <span class="n">ProgressHandler</span>
<span class="k">try</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">memory_profiler</span> <span class="c1"># noqa: F401</span>
<span class="n">has_memory_profiler</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">except</span> <span class="ne">Exception</span><span class="p">:</span>
<span class="n">has_memory_profiler</span> <span class="o">=</span> <span class="kc">False</span>
<span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;SparkSession&quot;</span><span class="p">]</span>
<span class="k">def</span> <span class="nf">_monkey_patch_RDD</span><span class="p">(</span><span class="n">sparkSession</span><span class="p">:</span> <span class="s2">&quot;SparkSession&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">toDF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">schema</span><span class="o">=</span><span class="kc">None</span><span class="p">,</span> <span class="n">sampleRatio</span><span class="o">=</span><span class="kc">None</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Converts current :class:`RDD` into a :class:`DataFrame`</span>
<span class="sd"> This is a shorthand for ``spark.createDataFrame(rdd, schema, sampleRatio)``</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> schema : :class:`pyspark.sql.types.DataType`, str or list, optional</span>
<span class="sd"> a :class:`pyspark.sql.types.DataType` or a datatype string or a list of</span>
<span class="sd"> column names, default is None. The data type string format equals to</span>
<span class="sd"> :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can</span>
<span class="sd"> omit the ``struct&lt;&gt;`` and atomic types use ``typeName()`` as their format, e.g. use</span>
<span class="sd"> ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`.</span>
<span class="sd"> We can also use ``int`` as a short name for :class:`pyspark.sql.types.IntegerType`.</span>
<span class="sd"> sampleRatio : float, optional</span>
<span class="sd"> the sample ratio of rows used for inferring</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; rdd = spark.range(1).rdd.map(lambda x: tuple(x))</span>
<span class="sd"> &gt;&gt;&gt; rdd.collect()</span>
<span class="sd"> [(0,)]</span>
<span class="sd"> &gt;&gt;&gt; rdd.toDF().show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | _1|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 0|</span>
<span class="sd"> +---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">sparkSession</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">sampleRatio</span><span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_remote_only</span><span class="p">():</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">RDD</span>
<span class="n">RDD</span><span class="o">.</span><span class="n">toDF</span> <span class="o">=</span> <span class="n">toDF</span> <span class="c1"># type: ignore[method-assign]</span>
<span class="c1"># TODO(SPARK-38912): This method can be dropped once support for Python 3.8 is dropped</span>
<span class="c1"># In Python 3.9, the @property decorator has been made compatible with the</span>
<span class="c1"># @classmethod decorator (https://docs.python.org/3.9/library/functions.html#classmethod)</span>
<span class="c1">#</span>
<span class="c1"># @classmethod + @property is also affected by a bug in Python&#39;s docstring which was backported</span>
<span class="c1"># to Python 3.9.6 (https://github.com/python/cpython/pull/28838)</span>
<span class="c1">#</span>
<span class="c1"># Python 3.9 with MyPy complains about @classmethod + @property combination. We should fix</span>
<span class="c1"># it together with MyPy.</span>
<span class="k">class</span> <span class="nc">classproperty</span><span class="p">(</span><span class="nb">property</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Same as Python&#39;s @property decorator, but for class attributes.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; class Builder:</span>
<span class="sd"> ... def build(self):</span>
<span class="sd"> ... return MyClass()</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; class MyClass:</span>
<span class="sd"> ... @classproperty</span>
<span class="sd"> ... def builder(cls):</span>
<span class="sd"> ... print(&quot;instantiating new builder&quot;)</span>
<span class="sd"> ... return Builder()</span>
<span class="sd"> ...</span>
<span class="sd"> &gt;&gt;&gt; c1 = MyClass.builder</span>
<span class="sd"> instantiating new builder</span>
<span class="sd"> &gt;&gt;&gt; c2 = MyClass.builder</span>
<span class="sd"> instantiating new builder</span>
<span class="sd"> &gt;&gt;&gt; c1 == c2</span>
<span class="sd"> False</span>
<span class="sd"> &gt;&gt;&gt; isinstance(c1.build(), MyClass)</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">def</span> <span class="fm">__get__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">instance</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="n">owner</span><span class="p">:</span> <span class="n">Any</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession.Builder&quot;</span><span class="p">:</span>
<span class="c1"># The &quot;type: ignore&quot; below silences the following error from mypy:</span>
<span class="c1"># error: Argument 1 to &quot;classmethod&quot; has incompatible</span>
<span class="c1"># type &quot;Optional[Callable[[Any], Any]]&quot;;</span>
<span class="c1"># expected &quot;Callable[..., Any]&quot; [arg-type]</span>
<span class="k">return</span> <span class="nb">classmethod</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">fget</span><span class="p">)</span><span class="o">.</span><span class="fm">__get__</span><span class="p">(</span><span class="kc">None</span><span class="p">,</span> <span class="n">owner</span><span class="p">)()</span> <span class="c1"># type: ignore</span>
<div class="viewcode-block" id="SparkSession"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.html#pyspark.sql.SparkSession">[docs]</a><span class="k">class</span> <span class="nc">SparkSession</span><span class="p">(</span><span class="n">SparkConversionMixin</span><span class="p">):</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;The entry point to programming Spark with the Dataset and DataFrame API.</span>
<span class="sd"> A SparkSession can be used to create :class:`DataFrame`, register :class:`DataFrame` as</span>
<span class="sd"> tables, execute SQL over tables, cache tables, and read parquet files.</span>
<span class="sd"> To create a :class:`SparkSession`, use the following builder pattern:</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> .. autoattribute:: builder</span>
<span class="sd"> :annotation:</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Create a Spark session.</span>
<span class="sd"> &gt;&gt;&gt; spark = (</span>
<span class="sd"> ... SparkSession.builder</span>
<span class="sd"> ... .master(&quot;local&quot;)</span>
<span class="sd"> ... .appName(&quot;Word Count&quot;)</span>
<span class="sd"> ... .config(&quot;spark.some.config.option&quot;, &quot;some-value&quot;)</span>
<span class="sd"> ... .getOrCreate()</span>
<span class="sd"> ... )</span>
<span class="sd"> Create a Spark session with Spark Connect.</span>
<span class="sd"> &gt;&gt;&gt; spark = (</span>
<span class="sd"> ... SparkSession.builder</span>
<span class="sd"> ... .remote(&quot;sc://localhost&quot;)</span>
<span class="sd"> ... .appName(&quot;Word Count&quot;)</span>
<span class="sd"> ... .config(&quot;spark.some.config.option&quot;, &quot;some-value&quot;)</span>
<span class="sd"> ... .getOrCreate()</span>
<span class="sd"> ... ) # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">class</span> <span class="nc">Builder</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Builder for :class:`SparkSession`.&quot;&quot;&quot;</span>
<span class="n">_lock</span> <span class="o">=</span> <span class="n">RLock</span><span class="p">()</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="p">{}</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">config</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="n">conf</span><span class="p">:</span> <span class="n">SparkConf</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession.Builder&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">config</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">key</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession.Builder&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">config</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="p">,</span> <span class="nb">map</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession.Builder&quot;</span><span class="p">:</span>
<span class="o">...</span>
<span class="k">def</span> <span class="nf">config</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">key</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">conf</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">SparkConf</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="o">*</span><span class="p">,</span>
<span class="nb">map</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="s2">&quot;OptionalPrimitiveType&quot;</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession.Builder&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Sets a config option. Options set using this method are automatically propagated to</span>
<span class="sd"> both :class:`SparkConf` and :class:`SparkSession`&#39;s own configuration.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> key : str, optional</span>
<span class="sd"> a key name string for configuration property</span>
<span class="sd"> value : str, optional</span>
<span class="sd"> a value for configuration property</span>
<span class="sd"> conf : :class:`SparkConf`, optional</span>
<span class="sd"> an instance of :class:`SparkConf`</span>
<span class="sd"> map: dictionary, optional</span>
<span class="sd"> a dictionary of configurations to set</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`SparkSession.Builder`</span>
<span class="sd"> See Also</span>
<span class="sd"> --------</span>
<span class="sd"> :class:`SparkConf`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> For an existing :class:`SparkConf`, use `conf` parameter.</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.conf import SparkConf</span>
<span class="sd"> &gt;&gt;&gt; conf = SparkConf().setAppName(&quot;example&quot;).setMaster(&quot;local&quot;)</span>
<span class="sd"> &gt;&gt;&gt; SparkSession.builder.config(conf=conf)</span>
<span class="sd"> &lt;pyspark.sql.session.SparkSession.Builder...</span>
<span class="sd"> For a (key, value) pair, you can omit parameter names.</span>
<span class="sd"> &gt;&gt;&gt; SparkSession.builder.config(&quot;spark.some.config.option&quot;, &quot;some-value&quot;)</span>
<span class="sd"> &lt;pyspark.sql.session.SparkSession.Builder...</span>
<span class="sd"> Set multiple configurations.</span>
<span class="sd"> &gt;&gt;&gt; SparkSession.builder.config(</span>
<span class="sd"> ... &quot;spark.some.config.number&quot;, 123).config(&quot;spark.some.config.float&quot;, 0.123)</span>
<span class="sd"> &lt;pyspark.sql.session.SparkSession.Builder...</span>
<span class="sd"> Set multiple configurations using a dictionary.</span>
<span class="sd"> &gt;&gt;&gt; SparkSession.builder.config(</span>
<span class="sd"> ... map={&quot;spark.some.config.number&quot;: 123, &quot;spark.some.config.float&quot;: 0.123})</span>
<span class="sd"> &lt;pyspark.sql.session.SparkSession.Builder...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">with</span> <span class="bp">self</span><span class="o">.</span><span class="n">_lock</span><span class="p">:</span>
<span class="k">if</span> <span class="n">conf</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">conf</span><span class="o">.</span><span class="n">getAll</span><span class="p">():</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">v</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_validate_startup_urls</span><span class="p">()</span>
<span class="k">elif</span> <span class="nb">map</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="nb">map</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> <span class="c1"># type: ignore[assignment]</span>
<span class="n">v</span> <span class="o">=</span> <span class="n">to_str</span><span class="p">(</span><span class="n">v</span><span class="p">)</span> <span class="c1"># type: ignore[assignment]</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="p">[</span><span class="n">k</span><span class="p">]</span> <span class="o">=</span> <span class="n">v</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_validate_startup_urls</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">value</span> <span class="o">=</span> <span class="n">to_str</span><span class="p">(</span><span class="n">value</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="p">[</span><span class="n">cast</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="n">key</span><span class="p">)]</span> <span class="o">=</span> <span class="n">value</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_validate_startup_urls</span><span class="p">()</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="k">def</span> <span class="nf">_validate_startup_urls</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Helper function that validates the combination of startup URLs and raises an exception</span>
<span class="sd"> if incompatible options are selected.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="p">(</span><span class="s2">&quot;spark.master&quot;</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_options</span> <span class="ow">or</span> <span class="s2">&quot;MASTER&quot;</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">)</span> <span class="ow">and</span> <span class="p">(</span>
<span class="s2">&quot;spark.remote&quot;</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_options</span> <span class="ow">or</span> <span class="s2">&quot;SPARK_REMOTE&quot;</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span>
<span class="p">):</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;CANNOT_CONFIGURE_SPARK_CONNECT_MASTER&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span>
<span class="s2">&quot;master_url&quot;</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;spark.master&quot;</span><span class="p">,</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;MASTER&quot;</span><span class="p">)),</span>
<span class="s2">&quot;connect_url&quot;</span><span class="p">:</span> <span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span>
<span class="s2">&quot;spark.remote&quot;</span><span class="p">,</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;SPARK_REMOTE&quot;</span><span class="p">)</span>
<span class="p">),</span>
<span class="p">},</span>
<span class="p">)</span>
<span class="k">if</span> <span class="s2">&quot;spark.remote&quot;</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="p">:</span>
<span class="n">remote</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="nb">str</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;spark.remote&quot;</span><span class="p">))</span>
<span class="k">if</span> <span class="p">(</span><span class="s2">&quot;SPARK_REMOTE&quot;</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span> <span class="ow">and</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_REMOTE&quot;</span><span class="p">]</span> <span class="o">!=</span> <span class="n">remote</span><span class="p">)</span> <span class="ow">and</span> <span class="p">(</span>
<span class="s2">&quot;SPARK_LOCAL_REMOTE&quot;</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span> <span class="ow">and</span> <span class="ow">not</span> <span class="n">remote</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">&quot;local&quot;</span><span class="p">)</span>
<span class="p">):</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;CANNOT_CONFIGURE_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span>
<span class="s2">&quot;existing_url&quot;</span><span class="p">:</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_REMOTE&quot;</span><span class="p">],</span>
<span class="s2">&quot;new_url&quot;</span><span class="p">:</span> <span class="n">remote</span><span class="p">,</span>
<span class="p">},</span>
<span class="p">)</span>
<span class="k">def</span> <span class="nf">master</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">master</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession.Builder&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Sets the Spark master URL to connect to, such as &quot;local&quot; to run locally, &quot;local[4]&quot;</span>
<span class="sd"> to run locally with 4 cores, or &quot;spark://master:7077&quot; to run on a Spark standalone</span>
<span class="sd"> cluster.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> master : str</span>
<span class="sd"> a url for spark master</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`SparkSession.Builder`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; SparkSession.builder.master(&quot;local&quot;)</span>
<span class="sd"> &lt;pyspark.sql.session.SparkSession.Builder...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="s2">&quot;spark.master&quot;</span><span class="p">,</span> <span class="n">master</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">remote</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">url</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession.Builder&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Sets the Spark remote URL to connect to, such as &quot;sc://host:port&quot; to run</span>
<span class="sd"> it via Spark Connect server.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> url : str</span>
<span class="sd"> URL to Spark Connect server</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`SparkSession.Builder`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; SparkSession.builder.remote(&quot;sc://localhost&quot;) # doctest: +SKIP</span>
<span class="sd"> &lt;pyspark.sql.session.SparkSession.Builder...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="s2">&quot;spark.remote&quot;</span><span class="p">,</span> <span class="n">url</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">appName</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession.Builder&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Sets a name for the application, which will be shown in the Spark web UI.</span>
<span class="sd"> If no application name is set, a randomly generated name will be used.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> name : str</span>
<span class="sd"> an application name</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`SparkSession.Builder`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; SparkSession.builder.appName(&quot;My app&quot;)</span>
<span class="sd"> &lt;pyspark.sql.session.SparkSession.Builder...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="s2">&quot;spark.app.name&quot;</span><span class="p">,</span> <span class="n">name</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">enableHiveSupport</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession.Builder&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Enables Hive support, including connectivity to a persistent Hive metastore, support</span>
<span class="sd"> for Hive SerDes, and Hive user-defined functions.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`SparkSession.Builder`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; SparkSession.builder.enableHiveSupport()</span>
<span class="sd"> &lt;pyspark.sql.session.SparkSession.Builder...</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="s2">&quot;spark.sql.catalogImplementation&quot;</span><span class="p">,</span> <span class="s2">&quot;hive&quot;</span><span class="p">)</span>
<span class="k">def</span> <span class="nf">getOrCreate</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Gets an existing :class:`SparkSession` or, if there is no existing one, creates a</span>
<span class="sd"> new one based on the options set in this builder.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`SparkSession`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> This method first checks whether there is a valid global default SparkSession, and if</span>
<span class="sd"> yes, return that one. If no valid global default SparkSession exists, the method</span>
<span class="sd"> creates a new SparkSession and assigns the newly created SparkSession as the global</span>
<span class="sd"> default.</span>
<span class="sd"> &gt;&gt;&gt; s1 = SparkSession.builder.config(&quot;k1&quot;, &quot;v1&quot;).getOrCreate()</span>
<span class="sd"> &gt;&gt;&gt; s1.conf.get(&quot;k1&quot;) == &quot;v1&quot;</span>
<span class="sd"> True</span>
<span class="sd"> The configuration of the SparkSession can be changed afterwards</span>
<span class="sd"> &gt;&gt;&gt; s1.conf.set(&quot;k1&quot;, &quot;v1_new&quot;)</span>
<span class="sd"> &gt;&gt;&gt; s1.conf.get(&quot;k1&quot;) == &quot;v1_new&quot;</span>
<span class="sd"> True</span>
<span class="sd"> In case an existing SparkSession is returned, the config options specified</span>
<span class="sd"> in this builder will be applied to the existing SparkSession.</span>
<span class="sd"> &gt;&gt;&gt; s2 = SparkSession.builder.config(&quot;k2&quot;, &quot;v2&quot;).getOrCreate()</span>
<span class="sd"> &gt;&gt;&gt; s1.conf.get(&quot;k1&quot;) == s2.conf.get(&quot;k1&quot;) == &quot;v1_new&quot;</span>
<span class="sd"> True</span>
<span class="sd"> &gt;&gt;&gt; s1.conf.get(&quot;k2&quot;) == s2.conf.get(&quot;k2&quot;) == &quot;v2&quot;</span>
<span class="sd"> True</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">opts</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="p">)</span>
<span class="k">if</span> <span class="n">is_remote_only</span><span class="p">():</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.connect.session</span> <span class="kn">import</span> <span class="n">SparkSession</span> <span class="k">as</span> <span class="n">RemoteSparkSession</span>
<span class="n">url</span> <span class="o">=</span> <span class="n">opts</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;spark.remote&quot;</span><span class="p">,</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;SPARK_REMOTE&quot;</span><span class="p">))</span>
<span class="k">if</span> <span class="n">url</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;CONNECT_URL_NOT_SET&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span>
<span class="p">)</span>
<span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_CONNECT_MODE_ENABLED&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;1&quot;</span>
<span class="n">opts</span><span class="p">[</span><span class="s2">&quot;spark.remote&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">url</span>
<span class="k">return</span> <span class="n">RemoteSparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="nb">map</span><span class="o">=</span><span class="n">opts</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> <span class="c1"># type: ignore</span>
<span class="kn">from</span> <span class="nn">pyspark.core.context</span> <span class="kn">import</span> <span class="n">SparkContext</span>
<span class="k">with</span> <span class="bp">self</span><span class="o">.</span><span class="n">_lock</span><span class="p">:</span>
<span class="k">if</span> <span class="p">(</span>
<span class="s2">&quot;SPARK_CONNECT_MODE_ENABLED&quot;</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span>
<span class="ow">or</span> <span class="s2">&quot;SPARK_REMOTE&quot;</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span>
<span class="ow">or</span> <span class="s2">&quot;spark.remote&quot;</span> <span class="ow">in</span> <span class="n">opts</span>
<span class="p">):</span>
<span class="k">with</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_lock</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.connect.session</span> <span class="kn">import</span> <span class="n">SparkSession</span> <span class="k">as</span> <span class="n">RemoteSparkSession</span>
<span class="k">if</span> <span class="p">(</span>
<span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span> <span class="ow">is</span> <span class="kc">None</span>
<span class="ow">and</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">_instantiatedSession</span> <span class="ow">is</span> <span class="kc">None</span>
<span class="p">):</span>
<span class="n">url</span> <span class="o">=</span> <span class="n">opts</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;spark.remote&quot;</span><span class="p">,</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;SPARK_REMOTE&quot;</span><span class="p">))</span>
<span class="k">if</span> <span class="n">url</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;CONNECT_URL_NOT_SET&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">url</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">&quot;local&quot;</span><span class="p">):</span>
<span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_LOCAL_REMOTE&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;1&quot;</span>
<span class="n">RemoteSparkSession</span><span class="o">.</span><span class="n">_start_connect_server</span><span class="p">(</span><span class="n">url</span><span class="p">,</span> <span class="n">opts</span><span class="p">)</span>
<span class="n">url</span> <span class="o">=</span> <span class="s2">&quot;sc://localhost&quot;</span>
<span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_CONNECT_MODE_ENABLED&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;1&quot;</span>
<span class="n">opts</span><span class="p">[</span><span class="s2">&quot;spark.remote&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">url</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span>
<span class="n">SparkSession</span><span class="p">,</span>
<span class="n">RemoteSparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="nb">map</span><span class="o">=</span><span class="n">opts</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">(),</span>
<span class="p">)</span>
<span class="k">elif</span> <span class="s2">&quot;SPARK_LOCAL_REMOTE&quot;</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">:</span>
<span class="n">url</span> <span class="o">=</span> <span class="s2">&quot;sc://localhost&quot;</span>
<span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_CONNECT_MODE_ENABLED&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;1&quot;</span>
<span class="n">opts</span><span class="p">[</span><span class="s2">&quot;spark.remote&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">url</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span>
<span class="n">SparkSession</span><span class="p">,</span>
<span class="n">RemoteSparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="nb">map</span><span class="o">=</span><span class="n">opts</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">(),</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;SESSION_ALREADY_EXIST&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span>
<span class="p">)</span>
<span class="n">session</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">_instantiatedSession</span>
<span class="k">if</span> <span class="n">session</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="n">session</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">_jsc</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">sparkConf</span> <span class="o">=</span> <span class="n">SparkConf</span><span class="p">()</span>
<span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="n">sparkConf</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span>
<span class="c1"># This SparkContext may be an existing one.</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">(</span><span class="n">sparkConf</span><span class="p">)</span>
<span class="c1"># Do not update `SparkConf` for existing `SparkContext`, as it&#39;s shared</span>
<span class="c1"># by all sessions.</span>
<span class="n">session</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">options</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="nb">getattr</span><span class="p">(</span>
<span class="nb">getattr</span><span class="p">(</span><span class="n">session</span><span class="o">.</span><span class="n">_jvm</span><span class="p">,</span> <span class="s2">&quot;SparkSession$&quot;</span><span class="p">),</span> <span class="s2">&quot;MODULE$&quot;</span>
<span class="p">)</span><span class="o">.</span><span class="n">applyModifiableSettings</span><span class="p">(</span><span class="n">session</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="p">)</span>
<span class="k">return</span> <span class="n">session</span>
<span class="c1"># Spark Connect-specific API</span>
<span class="k">def</span> <span class="nf">create</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Creates a new SparkSession. Can only be used in the context of Spark Connect</span>
<span class="sd"> and will throw an exception otherwise.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`SparkSession`</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This method will update the default and/or active session if they are not set.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">opts</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="p">)</span>
<span class="k">if</span> <span class="s2">&quot;SPARK_REMOTE&quot;</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span> <span class="ow">or</span> <span class="s2">&quot;spark.remote&quot;</span> <span class="ow">in</span> <span class="n">opts</span><span class="p">:</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.connect.session</span> <span class="kn">import</span> <span class="n">SparkSession</span> <span class="k">as</span> <span class="n">RemoteSparkSession</span>
<span class="c1"># Validate that no incompatible configuration options are selected.</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_validate_startup_urls</span><span class="p">()</span>
<span class="n">url</span> <span class="o">=</span> <span class="n">opts</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;spark.remote&quot;</span><span class="p">,</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;SPARK_REMOTE&quot;</span><span class="p">))</span>
<span class="k">if</span> <span class="n">url</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">&quot;local&quot;</span><span class="p">):</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;UNSUPPORTED_LOCAL_CONNECTION_STRING&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span>
<span class="p">)</span>
<span class="c1"># Mark this Spark Session as Spark Connect. This prevents that local PySpark is</span>
<span class="c1"># used in conjunction with Spark Connect mode.</span>
<span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_CONNECT_MODE_ENABLED&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;1&quot;</span>
<span class="n">opts</span><span class="p">[</span><span class="s2">&quot;spark.remote&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="n">url</span>
<span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">SparkSession</span><span class="p">,</span> <span class="n">RemoteSparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="nb">map</span><span class="o">=</span><span class="n">opts</span><span class="p">)</span><span class="o">.</span><span class="n">create</span><span class="p">())</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;ONLY_SUPPORTED_WITH_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;SparkSession.builder.create&quot;</span><span class="p">},</span>
<span class="p">)</span>
<span class="c1"># TODO(SPARK-38912): Replace classproperty with @classmethod + @property once support for</span>
<span class="c1"># Python 3.8 is dropped.</span>
<span class="c1">#</span>
<span class="c1"># In Python 3.9, the @property decorator has been made compatible with the</span>
<span class="c1"># @classmethod decorator (https://docs.python.org/3.9/library/functions.html#classmethod)</span>
<span class="c1">#</span>
<span class="c1"># @classmethod + @property is also affected by a bug in Python&#39;s docstring which was backported</span>
<span class="c1"># to Python 3.9.6 (https://github.com/python/cpython/pull/28838)</span>
<span class="c1">#</span>
<span class="c1"># SPARK-47544: Explicitly declaring this as an identifier instead of a method.</span>
<span class="c1"># If changing, make sure this bug is not reintroduced.</span>
<span class="n">builder</span><span class="p">:</span> <span class="n">Builder</span> <span class="o">=</span> <span class="n">classproperty</span><span class="p">(</span><span class="k">lambda</span> <span class="bp">cls</span><span class="p">:</span> <span class="bp">cls</span><span class="o">.</span><span class="n">Builder</span><span class="p">())</span> <span class="c1"># type: ignore</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Creates a :class:`Builder` for constructing a :class:`SparkSession`.</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">_instantiatedSession</span><span class="p">:</span> <span class="n">ClassVar</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;SparkSession&quot;</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">_activeSession</span><span class="p">:</span> <span class="n">ClassVar</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;SparkSession&quot;</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">sparkContext</span><span class="p">:</span> <span class="s2">&quot;SparkContext&quot;</span><span class="p">,</span>
<span class="n">jsparkSession</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;JavaObject&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">options</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="p">{},</span>
<span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_sc</span> <span class="o">=</span> <span class="n">sparkContext</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_jsc</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">_jsc</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">_jvm</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">jsparkSession</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">if</span> <span class="p">(</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">getDefaultSession</span><span class="p">()</span><span class="o">.</span><span class="n">isDefined</span><span class="p">()</span>
<span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">getDefaultSession</span><span class="p">()</span><span class="o">.</span><span class="n">get</span><span class="p">()</span><span class="o">.</span><span class="n">sparkContext</span><span class="p">()</span><span class="o">.</span><span class="n">isStopped</span><span class="p">()</span>
<span class="p">):</span>
<span class="n">jsparkSession</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">getDefaultSession</span><span class="p">()</span><span class="o">.</span><span class="n">get</span><span class="p">()</span>
<span class="nb">getattr</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="p">,</span> <span class="s2">&quot;SparkSession$&quot;</span><span class="p">),</span> <span class="s2">&quot;MODULE$&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">applyModifiableSettings</span><span class="p">(</span>
<span class="n">jsparkSession</span><span class="p">,</span> <span class="n">options</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">jsparkSession</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsc</span><span class="o">.</span><span class="n">sc</span><span class="p">(),</span> <span class="n">options</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="nb">getattr</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="p">,</span> <span class="s2">&quot;SparkSession$&quot;</span><span class="p">),</span> <span class="s2">&quot;MODULE$&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">applyModifiableSettings</span><span class="p">(</span>
<span class="n">jsparkSession</span><span class="p">,</span> <span class="n">options</span>
<span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span> <span class="o">=</span> <span class="n">jsparkSession</span>
<span class="n">_monkey_patch_RDD</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="n">install_exception_handler</span><span class="p">()</span>
<span class="c1"># If we had an instantiated SparkSession attached with a SparkContext</span>
<span class="c1"># which is stopped now, we need to renew the instantiated SparkSession.</span>
<span class="c1"># Otherwise, we will use invalid SparkSession when we call Builder.getOrCreate.</span>
<span class="k">if</span> <span class="p">(</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">_instantiatedSession</span> <span class="ow">is</span> <span class="kc">None</span>
<span class="ow">or</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">_instantiatedSession</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">_jsc</span> <span class="ow">is</span> <span class="kc">None</span>
<span class="p">):</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">_instantiatedSession</span> <span class="o">=</span> <span class="bp">self</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">_activeSession</span> <span class="o">=</span> <span class="bp">self</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">setDefaultSession</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">setActiveSession</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_profiler_collector</span> <span class="o">=</span> <span class="n">AccumulatorProfilerCollector</span><span class="p">()</span>
<span class="k">def</span> <span class="nf">_repr_html_</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="k">return</span> <span class="s2">&quot;&quot;&quot;</span>
<span class="s2"> &lt;div&gt;</span>
<span class="s2"> &lt;p&gt;&lt;b&gt;SparkSession - </span><span class="si">{catalogImplementation}</span><span class="s2">&lt;/b&gt;&lt;/p&gt;</span>
<span class="s2"> </span><span class="si">{sc_HTML}</span>
<span class="s2"> &lt;/div&gt;</span>
<span class="s2"> &quot;&quot;&quot;</span><span class="o">.</span><span class="n">format</span><span class="p">(</span>
<span class="n">catalogImplementation</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">conf</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;spark.sql.catalogImplementation&quot;</span><span class="p">),</span>
<span class="n">sc_HTML</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">sparkContext</span><span class="o">.</span><span class="n">_repr_html_</span><span class="p">(),</span>
<span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">_jconf</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;JavaObject&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Accessor for the JVM SQL-specific configurations&quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">sessionState</span><span class="p">()</span><span class="o">.</span><span class="n">conf</span><span class="p">()</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_remote_only</span><span class="p">():</span>
<div class="viewcode-block" id="SparkSession.newSession"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.newSession.html#pyspark.sql.SparkSession.newSession">[docs]</a> <span class="k">def</span> <span class="nf">newSession</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a new :class:`SparkSession` as new session, that has separate SQLConf,</span>
<span class="sd"> registered temporary views and UDFs, but shared :class:`SparkContext` and</span>
<span class="sd"> table cache.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`SparkSession`</span>
<span class="sd"> Spark session if an active session exists for the current thread</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.newSession()</span>
<span class="sd"> &lt;...SparkSession object ...&gt;</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">newSession</span><span class="p">())</span></div>
<div class="viewcode-block" id="SparkSession.getActiveSession"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.getActiveSession.html#pyspark.sql.SparkSession.getActiveSession">[docs]</a> <span class="nd">@classmethod</span>
<span class="nd">@try_remote_session_classmethod</span>
<span class="k">def</span> <span class="nf">getActiveSession</span><span class="p">(</span><span class="bp">cls</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">&quot;SparkSession&quot;</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the active :class:`SparkSession` for the current thread, returned by the builder</span>
<span class="sd"> .. versionadded:: 3.0.0</span>
<span class="sd"> .. versionchanged:: 3.5.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`SparkSession`</span>
<span class="sd"> Spark session if an active session exists for the current thread</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; s = SparkSession.getActiveSession()</span>
<span class="sd"> &gt;&gt;&gt; df = s.createDataFrame([(&#39;Alice&#39;, 1)], [&#39;name&#39;, &#39;age&#39;])</span>
<span class="sd"> &gt;&gt;&gt; df.select(&quot;age&quot;).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> |age|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 1|</span>
<span class="sd"> +---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">SparkContext</span>
<span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span>
<span class="k">if</span> <span class="n">sc</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">assert</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">getActiveSession</span><span class="p">()</span><span class="o">.</span><span class="n">isDefined</span><span class="p">():</span>
<span class="n">SparkSession</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">getActiveSession</span><span class="p">()</span><span class="o">.</span><span class="n">get</span><span class="p">())</span>
<span class="k">return</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">_activeSession</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="kc">None</span></div>
<div class="viewcode-block" id="SparkSession.active"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.active.html#pyspark.sql.SparkSession.active">[docs]</a> <span class="nd">@classmethod</span>
<span class="nd">@try_remote_session_classmethod</span>
<span class="k">def</span> <span class="nf">active</span><span class="p">(</span><span class="bp">cls</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the active or default :class:`SparkSession` for the current thread, returned by</span>
<span class="sd"> the builder.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`SparkSession`</span>
<span class="sd"> Spark session if an active or default session exists for the current thread.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">session</span> <span class="o">=</span> <span class="bp">cls</span><span class="o">.</span><span class="n">getActiveSession</span><span class="p">()</span>
<span class="k">if</span> <span class="n">session</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">session</span> <span class="o">=</span> <span class="bp">cls</span><span class="o">.</span><span class="n">_instantiatedSession</span>
<span class="k">if</span> <span class="n">session</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;NO_ACTIVE_OR_DEFAULT_SESSION&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">session</span></div>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_remote_only</span><span class="p">():</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">sparkContext</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkContext&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the underlying :class:`SparkContext`.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`SparkContext`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.sparkContext</span>
<span class="sd"> &lt;SparkContext master=... appName=...&gt;</span>
<span class="sd"> Create an RDD from the Spark context</span>
<span class="sd"> &gt;&gt;&gt; rdd = spark.sparkContext.parallelize([1, 2, 3])</span>
<span class="sd"> &gt;&gt;&gt; rdd.collect()</span>
<span class="sd"> [1, 2, 3]</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">version</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="nb">str</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> The version of Spark on which this application is running.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> str</span>
<span class="sd"> the version of Spark in string.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; _ = spark.version</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">version</span><span class="p">()</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">conf</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">RuntimeConfig</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Runtime configuration interface for Spark.</span>
<span class="sd"> This is the interface through which the user can get and set all Spark and Hadoop</span>
<span class="sd"> configurations that are relevant to Spark SQL. When getting the value of a config,</span>
<span class="sd"> this defaults to the value set in the underlying :class:`SparkContext`, if any.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`pyspark.sql.conf.RuntimeConfig`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.conf</span>
<span class="sd"> &lt;pyspark...RuntimeConf...&gt;</span>
<span class="sd"> Set a runtime configuration for the session</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.set(&quot;key&quot;, &quot;value&quot;)</span>
<span class="sd"> &gt;&gt;&gt; spark.conf.get(&quot;key&quot;)</span>
<span class="sd"> &#39;value&#39;</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">&quot;_conf&quot;</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_conf</span> <span class="o">=</span> <span class="n">RuntimeConfig</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">conf</span><span class="p">())</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_conf</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">catalog</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;Catalog&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Interface through which the user may create, drop, alter or query underlying</span>
<span class="sd"> databases, tables, functions, etc.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`Catalog`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.catalog</span>
<span class="sd"> &lt;...Catalog object ...&gt;</span>
<span class="sd"> Create a temp view, show the list, and drop it.</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1).createTempView(&quot;test_view&quot;)</span>
<span class="sd"> &gt;&gt;&gt; spark.catalog.listTables() # doctest: +SKIP</span>
<span class="sd"> [Table(name=&#39;test_view&#39;, catalog=None, namespace=[], description=None, ...</span>
<span class="sd"> &gt;&gt;&gt; _ = spark.catalog.dropTempView(&quot;test_view&quot;)</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.catalog</span> <span class="kn">import</span> <span class="n">Catalog</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">&quot;_catalog&quot;</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_catalog</span> <span class="o">=</span> <span class="n">Catalog</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_catalog</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">udf</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;UDFRegistration&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a :class:`UDFRegistration` for UDF registration.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`UDFRegistration`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Register a Python UDF, and use it in SQL.</span>
<span class="sd"> &gt;&gt;&gt; strlen = spark.udf.register(&quot;strlen&quot;, lambda x: len(x))</span>
<span class="sd"> &gt;&gt;&gt; spark.sql(&quot;SELECT strlen(&#39;test&#39;)&quot;).show()</span>
<span class="sd"> +------------+</span>
<span class="sd"> |strlen(test)|</span>
<span class="sd"> +------------+</span>
<span class="sd"> | 4|</span>
<span class="sd"> +------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.udf</span> <span class="kn">import</span> <span class="n">UDFRegistration</span>
<span class="k">return</span> <span class="n">UDFRegistration</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">udtf</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;UDTFRegistration&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a :class:`UDTFRegistration` for UDTF registration.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`UDTFRegistration`</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.udtf</span> <span class="kn">import</span> <span class="n">UDTFRegistration</span>
<span class="k">return</span> <span class="n">UDTFRegistration</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">dataSource</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;DataSourceRegistration&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a :class:`DataSourceRegistration` for data source registration.</span>
<span class="sd"> .. versionadded:: 4.0.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataSourceRegistration`</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This feature is experimental and unstable.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.datasource</span> <span class="kn">import</span> <span class="n">DataSourceRegistration</span>
<span class="k">return</span> <span class="n">DataSourceRegistration</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">profile</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Profile</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a :class:`Profile` for performance/memory profiling.</span>
<span class="sd"> .. versionadded:: 4.0.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`Profile`</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">Profile</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_profiler_collector</span><span class="p">)</span>
<div class="viewcode-block" id="SparkSession.range"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.range.html#pyspark.sql.SparkSession.range">[docs]</a> <span class="k">def</span> <span class="nf">range</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">start</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span>
<span class="n">end</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">step</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span>
<span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create a :class:`DataFrame` with single :class:`pyspark.sql.types.LongType` column named</span>
<span class="sd"> ``id``, containing elements in a range from ``start`` to ``end`` (exclusive) with</span>
<span class="sd"> step value ``step``.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> start : int</span>
<span class="sd"> the start value</span>
<span class="sd"> end : int, optional</span>
<span class="sd"> the end value (exclusive)</span>
<span class="sd"> step : int, optional</span>
<span class="sd"> the incremental step (default: 1)</span>
<span class="sd"> numPartitions : int, optional</span>
<span class="sd"> the number of partitions of the DataFrame</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.range(1, 7, 2).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | id|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 3|</span>
<span class="sd"> | 5|</span>
<span class="sd"> +---+</span>
<span class="sd"> If only one argument is specified, it will be used as the end value.</span>
<span class="sd"> &gt;&gt;&gt; spark.range(3).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | id|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> +---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">numPartitions</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">numPartitions</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">defaultParallelism</span>
<span class="k">if</span> <span class="n">end</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="nb">int</span><span class="p">(</span><span class="n">start</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">step</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">))</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">start</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">end</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">step</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">))</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_inferSchemaFromList</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">data</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Any</span><span class="p">],</span> <span class="n">names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">StructType</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Infer schema from list of Row, dict, or tuple.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> data : iterable</span>
<span class="sd"> list of Row, dict, or tuple</span>
<span class="sd"> names : list, optional</span>
<span class="sd"> list of column names</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`pyspark.sql.types.StructType`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">data</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;CANNOT_INFER_EMPTY_SCHEMA&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span>
<span class="p">)</span>
<span class="n">infer_dict_as_struct</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jconf</span><span class="o">.</span><span class="n">inferDictAsStruct</span><span class="p">()</span>
<span class="n">infer_array_from_first_element</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jconf</span><span class="o">.</span><span class="n">legacyInferArrayTypeFromFirstElement</span><span class="p">()</span>
<span class="n">infer_map_from_first_pair</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jconf</span><span class="o">.</span><span class="n">legacyInferMapStructTypeFromFirstItem</span><span class="p">()</span>
<span class="n">prefer_timestamp_ntz</span> <span class="o">=</span> <span class="n">is_timestamp_ntz_preferred</span><span class="p">()</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span>
<span class="n">_merge_type</span><span class="p">,</span>
<span class="p">(</span>
<span class="n">_infer_schema</span><span class="p">(</span>
<span class="n">row</span><span class="p">,</span>
<span class="n">names</span><span class="p">,</span>
<span class="n">infer_dict_as_struct</span><span class="o">=</span><span class="n">infer_dict_as_struct</span><span class="p">,</span>
<span class="n">infer_array_from_first_element</span><span class="o">=</span><span class="n">infer_array_from_first_element</span><span class="p">,</span>
<span class="n">infer_map_from_first_pair</span><span class="o">=</span><span class="n">infer_map_from_first_pair</span><span class="p">,</span>
<span class="n">prefer_timestamp_ntz</span><span class="o">=</span><span class="n">prefer_timestamp_ntz</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="n">data</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">_has_nulltype</span><span class="p">(</span><span class="n">schema</span><span class="p">):</span>
<span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;CANNOT_DETERMINE_TYPE&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">schema</span>
<span class="k">def</span> <span class="nf">_inferSchema</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">rdd</span><span class="p">:</span> <span class="s2">&quot;RDD[Any]&quot;</span><span class="p">,</span>
<span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">StructType</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Infer schema from an RDD of Row, dict, or tuple.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> rdd : :class:`RDD`</span>
<span class="sd"> an RDD of Row, dict, or tuple</span>
<span class="sd"> samplingRatio : float, optional</span>
<span class="sd"> sampling ratio, or no sampling (default)</span>
<span class="sd"> names : list, optional</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`pyspark.sql.types.StructType`</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">first</span> <span class="o">=</span> <span class="n">rdd</span><span class="o">.</span><span class="n">first</span><span class="p">()</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">first</span><span class="p">,</span> <span class="n">Sized</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">first</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;CANNOT_INFER_EMPTY_SCHEMA&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span>
<span class="p">)</span>
<span class="n">infer_dict_as_struct</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jconf</span><span class="o">.</span><span class="n">inferDictAsStruct</span><span class="p">()</span>
<span class="n">infer_array_from_first_element</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jconf</span><span class="o">.</span><span class="n">legacyInferArrayTypeFromFirstElement</span><span class="p">()</span>
<span class="n">infer_map_from_first_pair</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jconf</span><span class="o">.</span><span class="n">legacyInferMapStructTypeFromFirstItem</span><span class="p">()</span>
<span class="n">prefer_timestamp_ntz</span> <span class="o">=</span> <span class="n">is_timestamp_ntz_preferred</span><span class="p">()</span>
<span class="k">if</span> <span class="n">samplingRatio</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">_infer_schema</span><span class="p">(</span>
<span class="n">first</span><span class="p">,</span>
<span class="n">names</span><span class="o">=</span><span class="n">names</span><span class="p">,</span>
<span class="n">infer_dict_as_struct</span><span class="o">=</span><span class="n">infer_dict_as_struct</span><span class="p">,</span>
<span class="n">prefer_timestamp_ntz</span><span class="o">=</span><span class="n">prefer_timestamp_ntz</span><span class="p">,</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">_has_nulltype</span><span class="p">(</span><span class="n">schema</span><span class="p">):</span>
<span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="n">rdd</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">100</span><span class="p">)[</span><span class="mi">1</span><span class="p">:]:</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">_merge_type</span><span class="p">(</span>
<span class="n">schema</span><span class="p">,</span>
<span class="n">_infer_schema</span><span class="p">(</span>
<span class="n">row</span><span class="p">,</span>
<span class="n">names</span><span class="o">=</span><span class="n">names</span><span class="p">,</span>
<span class="n">infer_dict_as_struct</span><span class="o">=</span><span class="n">infer_dict_as_struct</span><span class="p">,</span>
<span class="n">infer_array_from_first_element</span><span class="o">=</span><span class="n">infer_array_from_first_element</span><span class="p">,</span>
<span class="n">infer_map_from_first_pair</span><span class="o">=</span><span class="n">infer_map_from_first_pair</span><span class="p">,</span>
<span class="n">prefer_timestamp_ntz</span><span class="o">=</span><span class="n">prefer_timestamp_ntz</span><span class="p">,</span>
<span class="p">),</span>
<span class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">_has_nulltype</span><span class="p">(</span><span class="n">schema</span><span class="p">):</span>
<span class="k">break</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;CANNOT_DETERMINE_TYPE&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">if</span> <span class="n">samplingRatio</span> <span class="o">&lt;</span> <span class="mf">0.99</span><span class="p">:</span>
<span class="n">rdd</span> <span class="o">=</span> <span class="n">rdd</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="kc">False</span><span class="p">,</span> <span class="nb">float</span><span class="p">(</span><span class="n">samplingRatio</span><span class="p">))</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span>
<span class="k">lambda</span> <span class="n">row</span><span class="p">:</span> <span class="n">_infer_schema</span><span class="p">(</span>
<span class="n">row</span><span class="p">,</span>
<span class="n">names</span><span class="p">,</span>
<span class="n">infer_dict_as_struct</span><span class="o">=</span><span class="n">infer_dict_as_struct</span><span class="p">,</span>
<span class="n">infer_array_from_first_element</span><span class="o">=</span><span class="n">infer_array_from_first_element</span><span class="p">,</span>
<span class="n">infer_map_from_first_pair</span><span class="o">=</span><span class="n">infer_map_from_first_pair</span><span class="p">,</span>
<span class="n">prefer_timestamp_ntz</span><span class="o">=</span><span class="n">prefer_timestamp_ntz</span><span class="p">,</span>
<span class="p">)</span>
<span class="p">)</span><span class="o">.</span><span class="n">reduce</span><span class="p">(</span><span class="n">_merge_type</span><span class="p">)</span>
<span class="k">return</span> <span class="n">schema</span>
<span class="k">def</span> <span class="nf">_createFromRDD</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">rdd</span><span class="p">:</span> <span class="s2">&quot;RDD[Any]&quot;</span><span class="p">,</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">DataType</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]],</span>
<span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">&quot;RDD[Tuple]&quot;</span><span class="p">,</span> <span class="n">StructType</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create an RDD for DataFrame from an existing RDD, returns the RDD and schema.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="n">schema</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span>
<span class="n">struct</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_inferSchema</span><span class="p">(</span><span class="n">rdd</span><span class="p">,</span> <span class="n">samplingRatio</span><span class="p">,</span> <span class="n">names</span><span class="o">=</span><span class="n">schema</span><span class="p">)</span>
<span class="n">converter</span> <span class="o">=</span> <span class="n">_create_converter</span><span class="p">(</span><span class="n">struct</span><span class="p">)</span>
<span class="n">tupled_rdd</span> <span class="o">=</span> <span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">converter</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">schema</span><span class="p">):</span>
<span class="n">struct</span><span class="o">.</span><span class="n">fields</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span>
<span class="n">struct</span><span class="o">.</span><span class="n">names</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">name</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">StructType</span><span class="p">):</span>
<span class="n">struct</span> <span class="o">=</span> <span class="n">schema</span>
<span class="n">tupled_rdd</span> <span class="o">=</span> <span class="n">rdd</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;NOT_LIST_OR_NONE_OR_STRUCT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span>
<span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="s2">&quot;schema&quot;</span><span class="p">,</span>
<span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">,</span>
<span class="p">},</span>
<span class="p">)</span>
<span class="c1"># convert python objects to sql data</span>
<span class="n">internal_rdd</span> <span class="o">=</span> <span class="n">tupled_rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">struct</span><span class="o">.</span><span class="n">toInternal</span><span class="p">)</span>
<span class="k">return</span> <span class="n">internal_rdd</span><span class="p">,</span> <span class="n">struct</span>
<span class="k">def</span> <span class="nf">_createFromLocal</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">data</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Any</span><span class="p">],</span> <span class="n">schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">DataType</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">&quot;RDD[Tuple]&quot;</span><span class="p">,</span> <span class="n">StructType</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Create an RDD for DataFrame from a list or pandas.DataFrame, returns</span>
<span class="sd"> the RDD and schema.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="c1"># make sure data could consumed multiple times</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span>
<span class="n">data</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">data</span><span class="p">)</span>
<span class="k">if</span> <span class="n">schema</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span>
<span class="n">struct</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_inferSchemaFromList</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">names</span><span class="o">=</span><span class="n">schema</span><span class="p">)</span>
<span class="n">converter</span> <span class="o">=</span> <span class="n">_create_converter</span><span class="p">(</span><span class="n">struct</span><span class="p">)</span>
<span class="n">tupled_data</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">]</span> <span class="o">=</span> <span class="nb">map</span><span class="p">(</span><span class="n">converter</span><span class="p">,</span> <span class="n">data</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span>
<span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">schema</span><span class="p">):</span>
<span class="n">struct</span><span class="o">.</span><span class="n">fields</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span>
<span class="n">struct</span><span class="o">.</span><span class="n">names</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">name</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">StructType</span><span class="p">):</span>
<span class="n">struct</span> <span class="o">=</span> <span class="n">schema</span>
<span class="n">tupled_data</span> <span class="o">=</span> <span class="n">data</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;NOT_LIST_OR_NONE_OR_STRUCT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span>
<span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="s2">&quot;schema&quot;</span><span class="p">,</span>
<span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">,</span>
<span class="p">},</span>
<span class="p">)</span>
<span class="c1"># convert python objects to sql data</span>
<span class="n">internal_data</span> <span class="o">=</span> <span class="p">[</span><span class="n">struct</span><span class="o">.</span><span class="n">toInternal</span><span class="p">(</span><span class="n">row</span><span class="p">)</span> <span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="n">tupled_data</span><span class="p">]</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">internal_data</span><span class="p">),</span> <span class="n">struct</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_create_shell_session</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Initialize a :class:`SparkSession` for a pyspark shell session. This is called from</span>
<span class="sd"> shell.py to make error handling simpler without needing to declare local variables in</span>
<span class="sd"> that script, which would expose those to users.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">import</span> <span class="nn">py4j</span>
<span class="kn">from</span> <span class="nn">pyspark.core.context</span> <span class="kn">import</span> <span class="n">SparkContext</span>
<span class="k">try</span><span class="p">:</span>
<span class="c1"># Try to access HiveConf, it will raise exception if Hive is not added</span>
<span class="n">conf</span> <span class="o">=</span> <span class="n">SparkConf</span><span class="p">()</span>
<span class="k">assert</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="k">if</span> <span class="n">conf</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;spark.sql.catalogImplementation&quot;</span><span class="p">,</span> <span class="s2">&quot;hive&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="s2">&quot;hive&quot;</span><span class="p">:</span>
<span class="n">SparkContext</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">hadoop</span><span class="o">.</span><span class="n">hive</span><span class="o">.</span><span class="n">conf</span><span class="o">.</span><span class="n">HiveConf</span><span class="p">()</span>
<span class="k">return</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">enableHiveSupport</span><span class="p">()</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">_getActiveSessionOrCreate</span><span class="p">()</span>
<span class="k">except</span> <span class="p">(</span><span class="n">py4j</span><span class="o">.</span><span class="n">protocol</span><span class="o">.</span><span class="n">Py4JError</span><span class="p">,</span> <span class="ne">TypeError</span><span class="p">):</span>
<span class="k">if</span> <span class="n">conf</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">&quot;spark.sql.catalogImplementation&quot;</span><span class="p">,</span> <span class="s2">&quot;&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="s2">&quot;hive&quot;</span><span class="p">:</span>
<span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span>
<span class="s2">&quot;Fall back to non-hive support because failing to access HiveConf, &quot;</span>
<span class="s2">&quot;please make sure you build spark with hive&quot;</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">_getActiveSessionOrCreate</span><span class="p">()</span>
<span class="nd">@staticmethod</span>
<span class="k">def</span> <span class="nf">_getActiveSessionOrCreate</span><span class="p">(</span><span class="o">**</span><span class="n">static_conf</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns the active :class:`SparkSession` for the current thread, returned by the builder,</span>
<span class="sd"> or if there is no existing one, creates a new one based on the options set in the builder.</span>
<span class="sd"> NOTE that &#39;static_conf&#39; might not be set if there&#39;s an active or default Spark session</span>
<span class="sd"> running.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">getActiveSession</span><span class="p">()</span>
<span class="k">if</span> <span class="n">spark</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">builder</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span>
<span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">static_conf</span><span class="o">.</span><span class="n">items</span><span class="p">():</span>
<span class="n">builder</span> <span class="o">=</span> <span class="n">builder</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">)</span>
<span class="n">spark</span> <span class="o">=</span> <span class="n">builder</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="k">return</span> <span class="n">spark</span>
<span class="nd">@overload</span> <span class="c1"># type: ignore[override]</span>
<span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">data</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="s2">&quot;RowLike&quot;</span><span class="p">],</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="o">...</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">data</span><span class="p">:</span> <span class="s2">&quot;RDD[RowLike]&quot;</span><span class="p">,</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="o">...</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">data</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="s2">&quot;RowLike&quot;</span><span class="p">],</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">data</span><span class="p">:</span> <span class="s2">&quot;RDD[RowLike]&quot;</span><span class="p">,</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span>
<span class="o">*</span><span class="p">,</span>
<span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">data</span><span class="p">:</span> <span class="s2">&quot;RDD[AtomicValue]&quot;</span><span class="p">,</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">AtomicType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span>
<span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">data</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="s2">&quot;AtomicValue&quot;</span><span class="p">],</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">AtomicType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span>
<span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">data</span><span class="p">:</span> <span class="s2">&quot;PandasDataFrameLike&quot;</span><span class="p">,</span> <span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">data</span><span class="p">:</span> <span class="s2">&quot;pa.Table&quot;</span><span class="p">,</span> <span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">data</span><span class="p">:</span> <span class="s2">&quot;PandasDataFrameLike&quot;</span><span class="p">,</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span>
<span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<span class="nd">@overload</span>
<span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">data</span><span class="p">:</span> <span class="s2">&quot;pa.Table&quot;</span><span class="p">,</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span>
<span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="o">...</span>
<div class="viewcode-block" id="SparkSession.createDataFrame"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.createDataFrame.html#pyspark.sql.SparkSession.createDataFrame">[docs]</a> <span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span> <span class="c1"># type: ignore[misc]</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">data</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;RDD[Any]&quot;</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Any</span><span class="p">],</span> <span class="s2">&quot;PandasDataFrameLike&quot;</span><span class="p">,</span> <span class="s2">&quot;ArrayLike&quot;</span><span class="p">,</span> <span class="s2">&quot;pa.Table&quot;</span><span class="p">],</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">AtomicType</span><span class="p">,</span> <span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span>
<span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame`,</span>
<span class="sd"> a :class:`numpy.ndarray`, or a :class:`pyarrow.Table`.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> .. versionchanged:: 4.0.0</span>
<span class="sd"> Supports :class:`pyarrow.Table`.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> data : :class:`RDD` or iterable</span>
<span class="sd"> an RDD of any kind of SQL data representation (:class:`Row`,</span>
<span class="sd"> :class:`tuple`, ``int``, ``boolean``, ``dict``, etc.), or :class:`list`,</span>
<span class="sd"> :class:`pandas.DataFrame`, :class:`numpy.ndarray`, or :class:`pyarrow.Table`.</span>
<span class="sd"> schema : :class:`pyspark.sql.types.DataType`, str or list, optional</span>
<span class="sd"> a :class:`pyspark.sql.types.DataType` or a datatype string or a list of</span>
<span class="sd"> column names, default is None. The data type string format equals to</span>
<span class="sd"> :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can</span>
<span class="sd"> omit the ``struct&lt;&gt;``.</span>
<span class="sd"> When ``schema`` is a list of column names, the type of each column</span>
<span class="sd"> will be inferred from ``data``.</span>
<span class="sd"> When ``schema`` is ``None``, it will try to infer the schema (column names and types)</span>
<span class="sd"> from ``data``, which should be an RDD of either :class:`Row`,</span>
<span class="sd"> :class:`namedtuple`, or :class:`dict`.</span>
<span class="sd"> When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must</span>
<span class="sd"> match the real data, or an exception will be thrown at runtime. If the given schema is</span>
<span class="sd"> not :class:`pyspark.sql.types.StructType`, it will be wrapped into a</span>
<span class="sd"> :class:`pyspark.sql.types.StructType` as its only field, and the field name will be</span>
<span class="sd"> &quot;value&quot;. Each record will also be wrapped into a tuple, which can be converted to row</span>
<span class="sd"> later.</span>
<span class="sd"> samplingRatio : float, optional</span>
<span class="sd"> the sample ratio of rows used for inferring. The first few rows will be used</span>
<span class="sd"> if ``samplingRatio`` is ``None``. This option is effective only when the input is</span>
<span class="sd"> :class:`RDD`.</span>
<span class="sd"> verifySchema : bool, optional</span>
<span class="sd"> verify data types of every row against schema. Enabled by default.</span>
<span class="sd"> When the input is :class:`pyarrow.Table` or when the input class is</span>
<span class="sd"> :class:`pandas.DataFrame` and `spark.sql.execution.arrow.pyspark.enabled` is enabled,</span>
<span class="sd"> this option is not effective. It follows Arrow type coercion. This option is not</span>
<span class="sd"> supported with Spark Connect.</span>
<span class="sd"> .. versionadded:: 2.1.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> Usage with `spark.sql.execution.arrow.pyspark.enabled=True` is experimental.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Create a DataFrame from a list of tuples.</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;Alice&#39;, 1)]).show()</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | _1| _2|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |Alice| 1|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> Create a DataFrame from a list of dictionaries.</span>
<span class="sd"> &gt;&gt;&gt; d = [{&#39;name&#39;: &#39;Alice&#39;, &#39;age&#39;: 1}]</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(d).show()</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> | 1|Alice|</span>
<span class="sd"> +---+-----+</span>
<span class="sd"> Create a DataFrame with column names specified.</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;Alice&#39;, 1)], [&#39;name&#39;, &#39;age&#39;]).show()</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | name|age|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |Alice| 1|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> Create a DataFrame with the explicit schema specified.</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.types import *</span>
<span class="sd"> &gt;&gt;&gt; schema = StructType([</span>
<span class="sd"> ... StructField(&quot;name&quot;, StringType(), True),</span>
<span class="sd"> ... StructField(&quot;age&quot;, IntegerType(), True)])</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;Alice&#39;, 1)], schema).show()</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | name|age|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |Alice| 1|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> Create a DataFrame with the schema in DDL formatted string.</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([(&#39;Alice&#39;, 1)], &quot;name: string, age: int&quot;).show()</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | name|age|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |Alice| 1|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> Create an empty DataFrame.</span>
<span class="sd"> When initializing an empty DataFrame in PySpark, it&#39;s mandatory to specify its schema,</span>
<span class="sd"> as the DataFrame lacks data from which the schema can be inferred.</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame([], &quot;name: string, age: int&quot;).show()</span>
<span class="sd"> +----+---+</span>
<span class="sd"> |name|age|</span>
<span class="sd"> +----+---+</span>
<span class="sd"> +----+---+</span>
<span class="sd"> Create a DataFrame from Row objects.</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql import Row</span>
<span class="sd"> &gt;&gt;&gt; Person = Row(&#39;name&#39;, &#39;age&#39;)</span>
<span class="sd"> &gt;&gt;&gt; df = spark.createDataFrame([Person(&quot;Alice&quot;, 1)])</span>
<span class="sd"> &gt;&gt;&gt; df.show()</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | name|age|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |Alice| 1|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> Create a DataFrame from a pandas DataFrame.</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(df.toPandas()).show() # doctest: +SKIP</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | name|age|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |Alice| 1|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 0| 1|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> Create a DataFrame from a PyArrow Table.</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(df.toArrow()).show() # doctest: +SKIP</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> | name|age|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> |Alice| 1|</span>
<span class="sd"> +-----+---+</span>
<span class="sd"> &gt;&gt;&gt; table = pyarrow.table({&#39;0&#39;: [1], &#39;1&#39;: [2]}) # doctest: +SKIP</span>
<span class="sd"> &gt;&gt;&gt; spark.createDataFrame(table).collect() # doctest: +SKIP</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 0| 1|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 2|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">_activeSession</span> <span class="o">=</span> <span class="bp">self</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">setActiveSession</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;INVALID_TYPE&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="s2">&quot;data&quot;</span><span class="p">,</span> <span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="s2">&quot;DataFrame&quot;</span><span class="p">},</span>
<span class="p">)</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">AtomicType</span><span class="p">,</span> <span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> <span class="n">_parse_datatype_string</span><span class="p">(</span><span class="n">schema</span><span class="p">))</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span>
<span class="c1"># Must re-encode any unicode strings to be consistent with StructField names</span>
<span class="n">schema</span> <span class="o">=</span> <span class="p">[</span><span class="n">x</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">&quot;utf-8&quot;</span><span class="p">)</span> <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="n">x</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">schema</span><span class="p">]</span>
<span class="k">try</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
<span class="n">has_pandas</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">except</span> <span class="ne">Exception</span><span class="p">:</span>
<span class="n">has_pandas</span> <span class="o">=</span> <span class="kc">False</span>
<span class="k">try</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>
<span class="n">has_numpy</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">except</span> <span class="ne">Exception</span><span class="p">:</span>
<span class="n">has_numpy</span> <span class="o">=</span> <span class="kc">False</span>
<span class="k">try</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span>
<span class="n">has_pyarrow</span> <span class="o">=</span> <span class="kc">True</span>
<span class="k">except</span> <span class="ne">Exception</span><span class="p">:</span>
<span class="n">has_pyarrow</span> <span class="o">=</span> <span class="kc">False</span>
<span class="k">if</span> <span class="n">has_numpy</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">):</span>
<span class="c1"># `data` of numpy.ndarray type will be converted to a pandas DataFrame,</span>
<span class="c1"># so pandas is required.</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.pandas.utils</span> <span class="kn">import</span> <span class="n">require_minimum_pandas_version</span>
<span class="n">require_minimum_pandas_version</span><span class="p">()</span>
<span class="k">if</span> <span class="n">data</span><span class="o">.</span><span class="n">ndim</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">]:</span>
<span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;INVALID_NDARRAY_DIMENSION&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;dimensions&quot;</span><span class="p">:</span> <span class="s2">&quot;1 or 2&quot;</span><span class="p">},</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">data</span><span class="o">.</span><span class="n">ndim</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">or</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
<span class="n">column_names</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;value&quot;</span><span class="p">]</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">column_names</span> <span class="o">=</span> <span class="p">[</span><span class="s2">&quot;_</span><span class="si">%s</span><span class="s2">&quot;</span> <span class="o">%</span> <span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)]</span>
<span class="k">if</span> <span class="n">schema</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jconf</span><span class="o">.</span><span class="n">arrowPySparkEnabled</span><span class="p">():</span>
<span class="c1"># Construct `schema` from `np.dtype` of the input NumPy array</span>
<span class="c1"># TODO: Apply the logic below when self._jconf.arrowPySparkEnabled() is True</span>
<span class="n">spark_type</span> <span class="o">=</span> <span class="n">_from_numpy_type</span><span class="p">(</span><span class="n">data</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span>
<span class="k">if</span> <span class="n">spark_type</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">(</span>
<span class="p">[</span><span class="n">StructField</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">spark_type</span><span class="p">,</span> <span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">column_names</span><span class="p">]</span>
<span class="p">)</span>
<span class="n">data</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">column_names</span><span class="p">)</span>
<span class="k">if</span> <span class="n">has_pandas</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span>
<span class="c1"># Create a DataFrame from pandas DataFrame.</span>
<span class="k">return</span> <span class="nb">super</span><span class="p">(</span><span class="n">SparkSession</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> <span class="c1"># type: ignore[call-overload]</span>
<span class="n">data</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">samplingRatio</span><span class="p">,</span> <span class="n">verifySchema</span>
<span class="p">)</span>
<span class="k">if</span> <span class="n">has_pyarrow</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">Table</span><span class="p">):</span>
<span class="c1"># Create a DataFrame from PyArrow Table.</span>
<span class="k">return</span> <span class="nb">super</span><span class="p">(</span><span class="n">SparkSession</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> <span class="c1"># type: ignore[call-overload]</span>
<span class="n">data</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">samplingRatio</span><span class="p">,</span> <span class="n">verifySchema</span>
<span class="p">)</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_create_dataframe</span><span class="p">(</span>
<span class="n">data</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">samplingRatio</span><span class="p">,</span> <span class="n">verifySchema</span> <span class="c1"># type: ignore[arg-type]</span>
<span class="p">)</span></div>
<span class="k">def</span> <span class="nf">_create_dataframe</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">data</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">&quot;RDD[Any]&quot;</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Any</span><span class="p">]],</span>
<span class="n">schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">DataType</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]],</span>
<span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span>
<span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">StructType</span><span class="p">):</span>
<span class="n">verify_func</span> <span class="o">=</span> <span class="n">_make_type_verifier</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span> <span class="k">if</span> <span class="n">verifySchema</span> <span class="k">else</span> <span class="k">lambda</span> <span class="n">_</span><span class="p">:</span> <span class="kc">True</span>
<span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">prepare</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span>
<span class="n">verify_func</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span>
<span class="k">return</span> <span class="n">obj</span>
<span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">DataType</span><span class="p">):</span>
<span class="n">dataType</span> <span class="o">=</span> <span class="n">schema</span>
<span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">()</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s2">&quot;value&quot;</span><span class="p">,</span> <span class="n">schema</span><span class="p">)</span>
<span class="n">verify_func</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">_make_type_verifier</span><span class="p">(</span><span class="n">dataType</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">&quot;field value&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="n">verifySchema</span>
<span class="k">else</span> <span class="k">lambda</span> <span class="n">_</span><span class="p">:</span> <span class="kc">True</span>
<span class="p">)</span>
<span class="nd">@no_type_check</span>
<span class="k">def</span> <span class="nf">prepare</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span>
<span class="n">verify_func</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span>
<span class="k">return</span> <span class="p">(</span><span class="n">obj</span><span class="p">,)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">def</span> <span class="nf">prepare</span><span class="p">(</span><span class="n">obj</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Any</span><span class="p">:</span>
<span class="k">return</span> <span class="n">obj</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_remote_only</span><span class="p">():</span>
<span class="kn">from</span> <span class="nn">pyspark.core.rdd</span> <span class="kn">import</span> <span class="n">RDD</span>
<span class="k">if</span> <span class="ow">not</span> <span class="n">is_remote_only</span><span class="p">()</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">RDD</span><span class="p">):</span>
<span class="n">rdd</span><span class="p">,</span> <span class="n">struct</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_createFromRDD</span><span class="p">(</span><span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">prepare</span><span class="p">),</span> <span class="n">schema</span><span class="p">,</span> <span class="n">samplingRatio</span><span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="n">rdd</span><span class="p">,</span> <span class="n">struct</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_createFromLocal</span><span class="p">(</span>
<span class="nb">map</span><span class="p">(</span><span class="n">prepare</span><span class="p">,</span> <span class="n">data</span><span class="p">),</span> <span class="n">schema</span> <span class="c1"># type: ignore[arg-type]</span>
<span class="p">)</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="n">jrdd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SerDeUtil</span><span class="o">.</span><span class="n">toJavaArray</span><span class="p">(</span><span class="n">rdd</span><span class="o">.</span><span class="n">_to_java_object_rdd</span><span class="p">())</span>
<span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">applySchemaToPythonRDD</span><span class="p">(</span><span class="n">jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">(),</span> <span class="n">struct</span><span class="o">.</span><span class="n">json</span><span class="p">())</span>
<span class="n">df</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span>
<span class="n">df</span><span class="o">.</span><span class="n">_schema</span> <span class="o">=</span> <span class="n">struct</span>
<span class="k">return</span> <span class="n">df</span>
<div class="viewcode-block" id="SparkSession.sql"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.sql.html#pyspark.sql.SparkSession.sql">[docs]</a> <span class="k">def</span> <span class="nf">sql</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="n">sqlQuery</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">args</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="n">List</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a :class:`DataFrame` representing the result of the given query.</span>
<span class="sd"> When ``kwargs`` is specified, this method formats the given string by using the Python</span>
<span class="sd"> standard formatter. The method binds named parameters to SQL literals or</span>
<span class="sd"> positional parameters from `args`. It doesn&#39;t support named and positional parameters</span>
<span class="sd"> in the same SQL query.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect and parameterized SQL.</span>
<span class="sd"> .. versionchanged:: 3.5.0</span>
<span class="sd"> Added positional parameters.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> sqlQuery : str</span>
<span class="sd"> SQL query string.</span>
<span class="sd"> args : dict or list</span>
<span class="sd"> A dictionary of parameter names to Python objects or a list of Python objects</span>
<span class="sd"> that can be converted to SQL literal expressions. See</span>
<span class="sd"> `Supported Data Types`_ for supported value types in Python.</span>
<span class="sd"> For example, dictionary keys: &quot;rank&quot;, &quot;name&quot;, &quot;birthdate&quot;;</span>
<span class="sd"> dictionary or list values: 1, &quot;Steven&quot;, datetime.date(2023, 4, 2).</span>
<span class="sd"> A value can be also a `Column` of a literal or collection constructor functions such</span>
<span class="sd"> as `map()`, `array()`, `struct()`, in that case it is taken as is.</span>
<span class="sd"> .. _Supported Data Types: https://spark.apache.org/docs/latest/sql-ref-datatypes.html</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> kwargs : dict</span>
<span class="sd"> Other variables that the user wants to set that can be referenced in the query</span>
<span class="sd"> .. versionchanged:: 3.3.0</span>
<span class="sd"> Added optional argument ``kwargs`` to specify the mapping of variables in the query.</span>
<span class="sd"> This feature is experimental and unstable.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> In Spark Classic, a temporary view referenced in `spark.sql` is resolved immediately,</span>
<span class="sd"> while in Spark Connect it is lazily analyzed.</span>
<span class="sd"> So in Spark Connect if a view is dropped, modified or replaced after `spark.sql`, the</span>
<span class="sd"> execution may fail or generate different results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> Executing a SQL query.</span>
<span class="sd"> &gt;&gt;&gt; spark.sql(&quot;SELECT * FROM range(10) where id &gt; 7&quot;).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | id|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 8|</span>
<span class="sd"> | 9|</span>
<span class="sd"> +---+</span>
<span class="sd"> Executing a SQL query with variables as Python formatter standard.</span>
<span class="sd"> &gt;&gt;&gt; spark.sql(</span>
<span class="sd"> ... &quot;SELECT * FROM range(10) WHERE id &gt; {bound1} AND id &lt; {bound2}&quot;, bound1=7, bound2=9</span>
<span class="sd"> ... ).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | id|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 8|</span>
<span class="sd"> +---+</span>
<span class="sd"> &gt;&gt;&gt; mydf = spark.range(10)</span>
<span class="sd"> &gt;&gt;&gt; spark.sql(</span>
<span class="sd"> ... &quot;SELECT {col} FROM {mydf} WHERE id IN {x}&quot;,</span>
<span class="sd"> ... col=mydf.id, mydf=mydf, x=tuple(range(4))).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | id|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 3|</span>
<span class="sd"> +---+</span>
<span class="sd"> &gt;&gt;&gt; spark.sql(&#39;&#39;&#39;</span>
<span class="sd"> ... SELECT m1.a, m2.b</span>
<span class="sd"> ... FROM {table1} m1 INNER JOIN {table2} m2</span>
<span class="sd"> ... ON m1.key = m2.key</span>
<span class="sd"> ... ORDER BY m1.a, m2.b&#39;&#39;&#39;,</span>
<span class="sd"> ... table1=spark.createDataFrame([(1, &quot;a&quot;), (2, &quot;b&quot;)], [&quot;a&quot;, &quot;key&quot;]),</span>
<span class="sd"> ... table2=spark.createDataFrame([(3, &quot;a&quot;), (4, &quot;b&quot;), (5, &quot;b&quot;)], [&quot;b&quot;, &quot;key&quot;])).show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | a| b|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 3|</span>
<span class="sd"> | 2| 4|</span>
<span class="sd"> | 2| 5|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> Also, it is possible to query using class:`Column` from :class:`DataFrame`.</span>
<span class="sd"> &gt;&gt;&gt; mydf = spark.createDataFrame([(1, 4), (2, 4), (3, 6)], [&quot;A&quot;, &quot;B&quot;])</span>
<span class="sd"> &gt;&gt;&gt; spark.sql(&quot;SELECT {df.A}, {df[B]} FROM {df}&quot;, df=mydf).show()</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | A| B|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> | 1| 4|</span>
<span class="sd"> | 2| 4|</span>
<span class="sd"> | 3| 6|</span>
<span class="sd"> +---+---+</span>
<span class="sd"> And substitute named parameters with the `:` prefix by SQL literals.</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import create_map, lit</span>
<span class="sd"> &gt;&gt;&gt; spark.sql(</span>
<span class="sd"> ... &quot;SELECT *, element_at(:m, &#39;a&#39;) AS C FROM {df} WHERE {df[B]} &gt; :minB&quot;,</span>
<span class="sd"> ... {&quot;minB&quot; : 5, &quot;m&quot; : create_map(lit(&#39;a&#39;), lit(1))}, df=mydf).show()</span>
<span class="sd"> +---+---+---+</span>
<span class="sd"> | A| B| C|</span>
<span class="sd"> +---+---+---+</span>
<span class="sd"> | 3| 6| 1|</span>
<span class="sd"> +---+---+---+</span>
<span class="sd"> Or positional parameters marked by `?` in the SQL query by SQL literals.</span>
<span class="sd"> &gt;&gt;&gt; from pyspark.sql.functions import array, lit</span>
<span class="sd"> &gt;&gt;&gt; spark.sql(</span>
<span class="sd"> ... &quot;SELECT *, element_at(?, 1) AS C FROM {df} WHERE {df[B]} &gt; ? and ? &lt; {df[A]}&quot;,</span>
<span class="sd"> ... args=[array(lit(1), lit(2), lit(3)), 5, 2], df=mydf).show()</span>
<span class="sd"> +---+---+---+</span>
<span class="sd"> | A| B| C|</span>
<span class="sd"> +---+---+---+</span>
<span class="sd"> | 3| 6| 1|</span>
<span class="sd"> +---+---+---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.classic.column</span> <span class="kn">import</span> <span class="n">_to_java_column</span>
<span class="n">formatter</span> <span class="o">=</span> <span class="n">SQLStringFormatter</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">kwargs</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">sqlQuery</span> <span class="o">=</span> <span class="n">formatter</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">sqlQuery</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span>
<span class="k">try</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">args</span><span class="p">,</span> <span class="n">Dict</span><span class="p">):</span>
<span class="n">litArgs</span> <span class="o">=</span> <span class="p">{</span><span class="n">k</span><span class="p">:</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">lit</span><span class="p">(</span><span class="n">v</span><span class="p">))</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="p">(</span><span class="n">args</span> <span class="ow">or</span> <span class="p">{})</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span>
<span class="k">elif</span> <span class="n">args</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">args</span><span class="p">,</span> <span class="n">List</span><span class="p">):</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="n">litArgs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonUtils</span><span class="o">.</span><span class="n">toArray</span><span class="p">(</span>
<span class="p">[</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">lit</span><span class="p">(</span><span class="n">v</span><span class="p">))</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="p">(</span><span class="n">args</span> <span class="ow">or</span> <span class="p">[])]</span>
<span class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;INVALID_TYPE&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="s2">&quot;args&quot;</span><span class="p">,</span> <span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">args</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="n">sqlQuery</span><span class="p">,</span> <span class="n">litArgs</span><span class="p">),</span> <span class="bp">self</span><span class="p">)</span>
<span class="k">finally</span><span class="p">:</span>
<span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">kwargs</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">:</span>
<span class="n">formatter</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span></div>
<div class="viewcode-block" id="SparkSession.table"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.table.html#pyspark.sql.SparkSession.table">[docs]</a> <span class="k">def</span> <span class="nf">table</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tableName</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrame</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns the specified table as a :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> tableName : str</span>
<span class="sd"> the table name to retrieve.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrame`</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> In Spark Classic, a temporary view referenced in `spark.table` is resolved immediately,</span>
<span class="sd"> while in Spark Connect it is lazily analyzed.</span>
<span class="sd"> So in Spark Connect if a view is dropped, modified or replaced after `spark.table`, the</span>
<span class="sd"> execution may fail or generate different results.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.range(5).createOrReplaceTempView(&quot;table1&quot;)</span>
<span class="sd"> &gt;&gt;&gt; spark.table(&quot;table1&quot;).sort(&quot;id&quot;).show()</span>
<span class="sd"> +---+</span>
<span class="sd"> | id|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 3|</span>
<span class="sd"> | 4|</span>
<span class="sd"> +---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">tableName</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span>
<span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;NOT_STR&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;arg_name&quot;</span><span class="p">:</span> <span class="s2">&quot;tableName&quot;</span><span class="p">,</span> <span class="s2">&quot;arg_type&quot;</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">tableName</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span>
<span class="p">)</span>
<span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">table</span><span class="p">(</span><span class="n">tableName</span><span class="p">),</span> <span class="bp">self</span><span class="p">)</span></div>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">read</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataFrameReader</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a :class:`DataFrameReader` that can be used to read data</span>
<span class="sd"> in as a :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataFrameReader`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.read</span>
<span class="sd"> &lt;...DataFrameReader object ...&gt;</span>
<span class="sd"> Write a DataFrame into a JSON file and read it back.</span>
<span class="sd"> &gt;&gt;&gt; import tempfile</span>
<span class="sd"> &gt;&gt;&gt; with tempfile.TemporaryDirectory(prefix=&quot;read&quot;) as d:</span>
<span class="sd"> ... # Write a DataFrame into a JSON file</span>
<span class="sd"> ... spark.createDataFrame(</span>
<span class="sd"> ... [{&quot;age&quot;: 100, &quot;name&quot;: &quot;Hyukjin Kwon&quot;}]</span>
<span class="sd"> ... ).write.mode(&quot;overwrite&quot;).format(&quot;json&quot;).save(d)</span>
<span class="sd"> ...</span>
<span class="sd"> ... # Read the JSON file as a DataFrame.</span>
<span class="sd"> ... spark.read.format(&#39;json&#39;).load(d).show()</span>
<span class="sd"> +---+------------+</span>
<span class="sd"> |age| name|</span>
<span class="sd"> +---+------------+</span>
<span class="sd"> |100|Hyukjin Kwon|</span>
<span class="sd"> +---+------------+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">DataFrameReader</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">readStream</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">DataStreamReader</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Returns a :class:`DataStreamReader` that can be used to read data streams</span>
<span class="sd"> as a streaming :class:`DataFrame`.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.5.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This API is evolving.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`DataStreamReader`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.readStream</span>
<span class="sd"> &lt;pyspark...DataStreamReader object ...&gt;</span>
<span class="sd"> The example below uses Rate source that generates rows continuously.</span>
<span class="sd"> After that, we operate a modulo by 3, and then write the stream out to the console.</span>
<span class="sd"> The streaming query stops in 3 seconds.</span>
<span class="sd"> &gt;&gt;&gt; import time</span>
<span class="sd"> &gt;&gt;&gt; df = spark.readStream.format(&quot;rate&quot;).load()</span>
<span class="sd"> &gt;&gt;&gt; df = df.selectExpr(&quot;value % 3 as v&quot;)</span>
<span class="sd"> &gt;&gt;&gt; q = df.writeStream.format(&quot;console&quot;).start()</span>
<span class="sd"> &gt;&gt;&gt; time.sleep(3)</span>
<span class="sd"> &gt;&gt;&gt; q.stop()</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="n">DataStreamReader</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">streams</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;StreamingQueryManager&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;Returns a :class:`StreamingQueryManager` that allows managing all the</span>
<span class="sd"> :class:`StreamingQuery` instances active on `this` context.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.5.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This API is evolving.</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`StreamingQueryManager`</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.streams</span>
<span class="sd"> &lt;pyspark...StreamingQueryManager object ...&gt;</span>
<span class="sd"> Get the list of active streaming queries</span>
<span class="sd"> &gt;&gt;&gt; sq = spark.readStream.format(</span>
<span class="sd"> ... &quot;rate&quot;).load().writeStream.format(&#39;memory&#39;).queryName(&#39;this_query&#39;).start()</span>
<span class="sd"> &gt;&gt;&gt; sqm = spark.streams</span>
<span class="sd"> &gt;&gt;&gt; [q.name for q in sqm.active]</span>
<span class="sd"> [&#39;this_query&#39;]</span>
<span class="sd"> &gt;&gt;&gt; sq.stop()</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.streaming</span> <span class="kn">import</span> <span class="n">StreamingQueryManager</span>
<span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">&quot;_sqm&quot;</span><span class="p">):</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sqm</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_sqm</span><span class="p">:</span> <span class="n">StreamingQueryManager</span> <span class="o">=</span> <span class="n">StreamingQueryManager</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">streams</span><span class="p">())</span>
<span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sqm</span>
<div class="viewcode-block" id="SparkSession.stop"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.stop.html#pyspark.sql.SparkSession.stop">[docs]</a> <span class="k">def</span> <span class="nf">stop</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Stop the underlying :class:`SparkContext`.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> .. versionchanged:: 3.4.0</span>
<span class="sd"> Supports Spark Connect.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; spark.stop() # doctest: +SKIP</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="kn">from</span> <span class="nn">pyspark.sql.context</span> <span class="kn">import</span> <span class="n">SQLContext</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="c1"># We should clean the default session up. See SPARK-23228.</span>
<span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">clearDefaultSession</span><span class="p">()</span>
<span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">clearActiveSession</span><span class="p">()</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">_instantiatedSession</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">_activeSession</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">SQLContext</span><span class="o">.</span><span class="n">_instantiatedContext</span> <span class="o">=</span> <span class="kc">None</span></div>
<span class="k">def</span> <span class="fm">__enter__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkSession&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Enable &#39;with SparkSession.builder.(...).getOrCreate() as session: app&#39; syntax.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; with SparkSession.builder.master(&quot;local&quot;).getOrCreate() as session:</span>
<span class="sd"> ... session.range(5).show() # doctest: +SKIP</span>
<span class="sd"> +---+</span>
<span class="sd"> | id|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 3|</span>
<span class="sd"> | 4|</span>
<span class="sd"> +---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">return</span> <span class="bp">self</span>
<span class="k">def</span> <span class="fm">__exit__</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span>
<span class="n">exc_type</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Type</span><span class="p">[</span><span class="ne">BaseException</span><span class="p">]],</span>
<span class="n">exc_val</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="ne">BaseException</span><span class="p">],</span>
<span class="n">exc_tb</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">TracebackType</span><span class="p">],</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Enable &#39;with SparkSession.builder.(...).getOrCreate() as session: app&#39; syntax.</span>
<span class="sd"> Specifically stop the SparkSession on exit of the with block.</span>
<span class="sd"> .. versionadded:: 2.0.0</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; with SparkSession.builder.master(&quot;local&quot;).getOrCreate() as session:</span>
<span class="sd"> ... session.range(5).show() # doctest: +SKIP</span>
<span class="sd"> +---+</span>
<span class="sd"> | id|</span>
<span class="sd"> +---+</span>
<span class="sd"> | 0|</span>
<span class="sd"> | 1|</span>
<span class="sd"> | 2|</span>
<span class="sd"> | 3|</span>
<span class="sd"> | 4|</span>
<span class="sd"> +---+</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="c1"># SparkConnect-specific API</span>
<span class="nd">@property</span>
<span class="k">def</span> <span class="nf">client</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="s2">&quot;SparkConnectClient&quot;</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Gives access to the Spark Connect client. In normal cases this is not necessary to be used</span>
<span class="sd"> and only relevant for testing.</span>
<span class="sd"> .. versionadded:: 3.4.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> :class:`SparkConnectClient`</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This API is unstable, and a developer API. It returns non-API instance</span>
<span class="sd"> :class:`SparkConnectClient`.</span>
<span class="sd"> This is an API dedicated to Spark Connect client only. With regular Spark Session, it throws</span>
<span class="sd"> an exception.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;ONLY_SUPPORTED_WITH_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;SparkSession.client&quot;</span><span class="p">},</span>
<span class="p">)</span>
<div class="viewcode-block" id="SparkSession.addArtifacts"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.addArtifacts.html#pyspark.sql.SparkSession.addArtifacts">[docs]</a> <span class="k">def</span> <span class="nf">addArtifacts</span><span class="p">(</span>
<span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pyfile</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">archive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">file</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span>
<span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Add artifact(s) to the client session. Currently only local files are supported.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> *path : tuple of str</span>
<span class="sd"> Artifact&#39;s URIs to add.</span>
<span class="sd"> pyfile : bool</span>
<span class="sd"> Whether to add them as Python dependencies such as .py, .egg, .zip or .jar files.</span>
<span class="sd"> The pyfiles are directly inserted into the path when executing Python functions</span>
<span class="sd"> in executors.</span>
<span class="sd"> archive : bool</span>
<span class="sd"> Whether to add them as archives such as .zip, .jar, .tar.gz, .tgz, or .tar files.</span>
<span class="sd"> The archives are unpacked on the executor side automatically.</span>
<span class="sd"> file : bool</span>
<span class="sd"> Add a file to be downloaded with this Spark job on every node.</span>
<span class="sd"> The ``path`` passed can only be a local file for now.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This is an API dedicated to Spark Connect client only. With regular Spark Session, it throws</span>
<span class="sd"> an exception.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;ONLY_SUPPORTED_WITH_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;SparkSession.addArtifact(s)&quot;</span><span class="p">},</span>
<span class="p">)</span></div>
<span class="n">addArtifact</span> <span class="o">=</span> <span class="n">addArtifacts</span>
<div class="viewcode-block" id="SparkSession.registerProgressHandler"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.registerProgressHandler.html#pyspark.sql.SparkSession.registerProgressHandler">[docs]</a> <span class="k">def</span> <span class="nf">registerProgressHandler</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">handler</span><span class="p">:</span> <span class="s2">&quot;ProgressHandler&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Register a progress handler to be called when a progress update is received from the server.</span>
<span class="sd"> .. versionadded:: 4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> handler : ProgressHandler</span>
<span class="sd"> A callable that follows the ProgressHandler interface. This handler will be called</span>
<span class="sd"> on every progress update.</span>
<span class="sd"> Examples</span>
<span class="sd"> --------</span>
<span class="sd"> &gt;&gt;&gt; def progress_handler(stages, inflight_tasks, done):</span>
<span class="sd"> ... print(f&quot;{len(stages)} Stages known, Done: {done}&quot;)</span>
<span class="sd"> &gt;&gt;&gt; spark.registerProgressHandler(progress_handler)</span>
<span class="sd"> &gt;&gt;&gt; res = spark.range(10).repartition(1).collect() # doctest: +SKIP</span>
<span class="sd"> 3 Stages known, Done: False</span>
<span class="sd"> 3 Stages known, Done: True</span>
<span class="sd"> &gt;&gt;&gt; spark.clearProgressHandlers()</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;ONLY_SUPPORTED_WITH_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;SparkSession.registerProgressHandler&quot;</span><span class="p">},</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="SparkSession.removeProgressHandler"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.removeProgressHandler.html#pyspark.sql.SparkSession.removeProgressHandler">[docs]</a> <span class="k">def</span> <span class="nf">removeProgressHandler</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">handler</span><span class="p">:</span> <span class="s2">&quot;ProgressHandler&quot;</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Remove a progress handler that was previously registered.</span>
<span class="sd"> .. versionadded:: 4.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> handler : ProgressHandler</span>
<span class="sd"> The handler to remove if present in the list of progress handlers.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;ONLY_SUPPORTED_WITH_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;SparkSession.removeProgressHandler&quot;</span><span class="p">},</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="SparkSession.clearProgressHandlers"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.clearProgressHandlers.html#pyspark.sql.SparkSession.clearProgressHandlers">[docs]</a> <span class="k">def</span> <span class="nf">clearProgressHandlers</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Clear all registered progress handlers.</span>
<span class="sd"> .. versionadded:: 4.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;ONLY_SUPPORTED_WITH_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;SparkSession.clearProgressHandlers&quot;</span><span class="p">},</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="SparkSession.copyFromLocalToFs"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.copyFromLocalToFs.html#pyspark.sql.SparkSession.copyFromLocalToFs">[docs]</a> <span class="k">def</span> <span class="nf">copyFromLocalToFs</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">local_path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">dest_path</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Copy file from local to cloud storage file system.</span>
<span class="sd"> If the file already exits in destination path, old file is overwritten.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> local_path: str</span>
<span class="sd"> Path to a local file. Directories are not supported.</span>
<span class="sd"> The path can be either an absolute path or a relative path.</span>
<span class="sd"> dest_path: str</span>
<span class="sd"> The cloud storage path to the destination the file will</span>
<span class="sd"> be copied to.</span>
<span class="sd"> The path must be an an absolute path.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> This API is a developer API.</span>
<span class="sd"> Also, this is an API dedicated to Spark Connect client only. With regular</span>
<span class="sd"> Spark Session, it throws an exception.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;ONLY_SUPPORTED_WITH_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;SparkSession.copyFromLocalToFs&quot;</span><span class="p">},</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="SparkSession.interruptAll"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.interruptAll.html#pyspark.sql.SparkSession.interruptAll">[docs]</a> <span class="k">def</span> <span class="nf">interruptAll</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Interrupt all operations of this session currently running on the connected server.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> list of str</span>
<span class="sd"> List of operationIds of interrupted operations.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> There is still a possibility of operation finishing just as it is interrupted.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;ONLY_SUPPORTED_WITH_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;SparkSession.interruptAll&quot;</span><span class="p">},</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="SparkSession.interruptTag"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.interruptTag.html#pyspark.sql.SparkSession.interruptTag">[docs]</a> <span class="k">def</span> <span class="nf">interruptTag</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tag</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Interrupt all operations of this session with the given operation tag.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> list of str</span>
<span class="sd"> List of operationIds of interrupted operations.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> There is still a possibility of operation finishing just as it is interrupted.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;ONLY_SUPPORTED_WITH_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;SparkSession.interruptTag&quot;</span><span class="p">},</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="SparkSession.interruptOperation"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.interruptOperation.html#pyspark.sql.SparkSession.interruptOperation">[docs]</a> <span class="k">def</span> <span class="nf">interruptOperation</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">op_id</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Interrupt an operation of this session with the given operationId.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> list of str</span>
<span class="sd"> List of operationIds of interrupted operations.</span>
<span class="sd"> Notes</span>
<span class="sd"> -----</span>
<span class="sd"> There is still a possibility of operation finishing just as it is interrupted.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;ONLY_SUPPORTED_WITH_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;SparkSession.interruptOperation&quot;</span><span class="p">},</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="SparkSession.addTag"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.addTag.html#pyspark.sql.SparkSession.addTag">[docs]</a> <span class="k">def</span> <span class="nf">addTag</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tag</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Add a tag to be assigned to all the operations started by this thread in this session.</span>
<span class="sd"> Often, a unit of execution in an application consists of multiple Spark executions.</span>
<span class="sd"> Application programmers can use this method to group all those jobs together and give a</span>
<span class="sd"> group tag. The application can use :meth:`SparkSession.interruptTag` to cancel all running</span>
<span class="sd"> executions with this tag.</span>
<span class="sd"> There may be multiple tags present at the same time, so different parts of application may</span>
<span class="sd"> use different tags to perform cancellation at different levels of granularity.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> tag : str</span>
<span class="sd"> The tag to be added. Cannot contain &#39;,&#39; (comma) character or be an empty string.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;ONLY_SUPPORTED_WITH_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;SparkSession.addTag&quot;</span><span class="p">},</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="SparkSession.removeTag"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.removeTag.html#pyspark.sql.SparkSession.removeTag">[docs]</a> <span class="k">def</span> <span class="nf">removeTag</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tag</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Remove a tag previously added to be assigned to all the operations started by this thread in</span>
<span class="sd"> this session. Noop if such a tag was not added earlier.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Parameters</span>
<span class="sd"> ----------</span>
<span class="sd"> tag : list of str</span>
<span class="sd"> The tag to be removed. Cannot contain &#39;,&#39; (comma) character or be an empty string.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;ONLY_SUPPORTED_WITH_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;SparkSession.removeTag&quot;</span><span class="p">},</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="SparkSession.getTags"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.getTags.html#pyspark.sql.SparkSession.getTags">[docs]</a> <span class="k">def</span> <span class="nf">getTags</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Set</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Get the tags that are currently set to be assigned to all the operations started by this</span>
<span class="sd"> thread.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> Returns</span>
<span class="sd"> -------</span>
<span class="sd"> set of str</span>
<span class="sd"> Set of tags of interrupted operations.</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;ONLY_SUPPORTED_WITH_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;SparkSession.getTags&quot;</span><span class="p">},</span>
<span class="p">)</span></div>
<div class="viewcode-block" id="SparkSession.clearTags"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.clearTags.html#pyspark.sql.SparkSession.clearTags">[docs]</a> <span class="k">def</span> <span class="nf">clearTags</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="w"> </span><span class="sd">&quot;&quot;&quot;</span>
<span class="sd"> Clear the current thread&#39;s operation tags.</span>
<span class="sd"> .. versionadded:: 3.5.0</span>
<span class="sd"> &quot;&quot;&quot;</span>
<span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span>
<span class="n">errorClass</span><span class="o">=</span><span class="s2">&quot;ONLY_SUPPORTED_WITH_SPARK_CONNECT&quot;</span><span class="p">,</span>
<span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;feature&quot;</span><span class="p">:</span> <span class="s2">&quot;SparkSession.clearTags&quot;</span><span class="p">},</span>
<span class="p">)</span></div></div>
<span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-&gt;</span> <span class="kc">None</span><span class="p">:</span>
<span class="kn">import</span> <span class="nn">os</span>
<span class="kn">import</span> <span class="nn">doctest</span>
<span class="kn">import</span> <span class="nn">pyspark.sql.session</span>
<span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">&quot;SPARK_HOME&quot;</span><span class="p">])</span>
<span class="c1"># Disable Doc Tests for Spark Connect only functions:</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">registerProgressHandler</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">removeProgressHandler</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">clearProgressHandlers</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="kc">None</span>
<span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;spark&quot;</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span>
<span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">&quot;local[4]&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">&quot;sql.session tests&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="p">)</span>
<span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span>
<span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">session</span><span class="p">,</span>
<span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span>
<span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span>
<span class="p">)</span>
<span class="n">globs</span><span class="p">[</span><span class="s2">&quot;spark&quot;</span><span class="p">]</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span>
<span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span>
<span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">&quot;__main__&quot;</span><span class="p">:</span>
<span class="n">_test</span><span class="p">()</span>
</pre></div>
</article>
<footer class="bd-footer-article">
<div class="footer-article-items footer-article__inner">
<div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
</div></div>
</div>
</footer>
</div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
</body>
</html>