Source code for pyspark.sql.session (PySpark 4.0.0-preview2)

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import sys
import warnings
from collections.abc import Sized
from functools import reduce
from threading import RLock
from types import TracebackType
from typing import (
    Any,
    ClassVar,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
    Type,
    Union,
    Set,
    cast,
    no_type_check,
    overload,
    TYPE_CHECKING,
)

from pyspark.conf import SparkConf
from pyspark.util import is_remote_only
from pyspark.sql.conf import RuntimeConfig
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import lit
from pyspark.sql.pandas.conversion import SparkConversionMixin
from pyspark.sql.profiler import AccumulatorProfilerCollector, Profile
from pyspark.sql.readwriter import DataFrameReader
from pyspark.sql.sql_formatter import SQLStringFormatter
from pyspark.sql.streaming import DataStreamReader
from pyspark.sql.types import (
    AtomicType,
    DataType,
    StructField,
    StructType,
    _make_type_verifier,
    _infer_schema,
    _has_nulltype,
    _merge_type,
    _create_converter,
    _parse_datatype_string,
    _from_numpy_type,
)
from pyspark.errors.exceptions.captured import install_exception_handler
from pyspark.sql.utils import is_timestamp_ntz_preferred, to_str, try_remote_session_classmethod
from pyspark.errors import PySparkValueError, PySparkTypeError, PySparkRuntimeError

if TYPE_CHECKING:
    from py4j.java_gateway import JavaObject
    import pyarrow as pa
    from pyspark.core.context import SparkContext
    from pyspark.core.rdd import RDD
    from pyspark.sql._typing import AtomicValue, RowLike, OptionalPrimitiveType
    from pyspark.sql.catalog import Catalog
    from pyspark.sql.pandas._typing import ArrayLike, DataFrameLike as PandasDataFrameLike
    from pyspark.sql.streaming import StreamingQueryManager
    from pyspark.sql.udf import UDFRegistration
    from pyspark.sql.udtf import UDTFRegistration
    from pyspark.sql.datasource import DataSourceRegistration

    # Running MyPy type checks will always require pandas and
    # other dependencies so importing here is fine.
    from pyspark.sql.connect.client import SparkConnectClient
    from pyspark.sql.connect.shell.progress import ProgressHandler

try:
    import memory_profiler  # noqa: F401

    has_memory_profiler = True
except Exception:
    has_memory_profiler = False

__all__ = ["SparkSession"]


def _monkey_patch_RDD(sparkSession: "SparkSession") -> None:
    @no_type_check
    def toDF(self, schema=None, sampleRatio=None):
        """
        Converts current :class:`RDD` into a :class:`DataFrame`

        This is a shorthand for ``spark.createDataFrame(rdd, schema, sampleRatio)``

        Parameters
        ----------
        schema : :class:`pyspark.sql.types.DataType`, str or list, optional
            a :class:`pyspark.sql.types.DataType` or a datatype string or a list of
            column names, default is None. The data type string format equals to
            :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can
            omit the ``struct<>`` and atomic types use ``typeName()`` as their format, e.g. use
            ``byte`` instead of ``tinyint`` for :class:`pyspark.sql.types.ByteType`.
            We can also use ``int`` as a short name for :class:`pyspark.sql.types.IntegerType`.
        sampleRatio : float, optional
            the sample ratio of rows used for inferring

        Returns
        -------
        :class:`DataFrame`

        Examples
        --------
        >>> rdd = spark.range(1).rdd.map(lambda x: tuple(x))
        >>> rdd.collect()
        [(0,)]
        >>> rdd.toDF().show()
        +---+
        | _1|
        +---+
        |  0|
        +---+
        """
        return sparkSession.createDataFrame(self, schema, sampleRatio)

    if not is_remote_only():
        from pyspark import RDD

        RDD.toDF = toDF  # type: ignore[method-assign]


# TODO(SPARK-38912): This method can be dropped once support for Python 3.8 is dropped
# In Python 3.9, the @property decorator has been made compatible with the
# @classmethod decorator (https://docs.python.org/3.9/library/functions.html#classmethod)
#
# @classmethod + @property is also affected by a bug in Python's docstring which was backported
# to Python 3.9.6 (https://github.com/python/cpython/pull/28838)
#
# Python 3.9 with MyPy complains about @classmethod + @property combination. We should fix
# it together with MyPy.
class classproperty(property):
    """Same as Python's @property decorator, but for class attributes.

    Examples
    --------
    >>> class Builder:
    ...     def build(self):
    ...         return MyClass()
    ...
    >>> class MyClass:
    ...     @classproperty
    ...     def builder(cls):
    ...         print("instantiating new builder")
    ...         return Builder()
    ...
    >>> c1 = MyClass.builder
    instantiating new builder
    >>> c2 = MyClass.builder
    instantiating new builder
    >>> c1 == c2
    False
    >>> isinstance(c1.build(), MyClass)
    True
    """

    def __get__(self, instance: Any, owner: Any = None) -> "SparkSession.Builder":
        # The "type: ignore" below silences the following error from mypy:
        # error: Argument 1 to "classmethod" has incompatible
        # type "Optional[Callable[[Any], Any]]";
        # expected "Callable[..., Any]" [arg-type]
        return classmethod(self.fget).__get__(None, owner)()  # type: ignore


class SparkSession(SparkConversionMixin):
    """The entry point to programming Spark with the Dataset and DataFrame API.

    A SparkSession can be used to create :class:`DataFrame`, register :class:`DataFrame` as
    tables, execute SQL over tables, cache tables, and read parquet files.
    To create a :class:`SparkSession`, use the following builder pattern:

    .. versionchanged:: 3.4.0
        Supports Spark Connect.

    .. autoattribute:: builder
       :annotation:

    Examples
    --------
    Create a Spark session.

    >>> spark = (
    ...     SparkSession.builder
    ...         .master("local")
    ...         .appName("Word Count")
    ...         .config("spark.some.config.option", "some-value")
    ...         .getOrCreate()
    ... )

    Create a Spark session with Spark Connect.

    >>> spark = (
    ...     SparkSession.builder
    ...         .remote("sc://localhost")
    ...         .appName("Word Count")
    ...         .config("spark.some.config.option", "some-value")
    ...         .getOrCreate()
    ... )  # doctest: +SKIP
    """

    class Builder:
        """Builder for :class:`SparkSession`."""

        _lock = RLock()

        def __init__(self) -> None:
            self._options: Dict[str, Any] = {}

        @overload
        def config(self, *, conf: SparkConf) -> "SparkSession.Builder":
            ...

        @overload
        def config(self, key: str, value: Any) -> "SparkSession.Builder":
            ...

        @overload
        def config(self, *, map: Dict[str, "OptionalPrimitiveType"]) -> "SparkSession.Builder":
            ...

        def config(
            self,
            key: Optional[str] = None,
            value: Optional[Any] = None,
            conf: Optional[SparkConf] = None,
            *,
            map: Optional[Dict[str, "OptionalPrimitiveType"]] = None,
        ) -> "SparkSession.Builder":
            """Sets a config option. Options set using this method are automatically propagated to
            both :class:`SparkConf` and :class:`SparkSession`'s own configuration.

            .. versionadded:: 2.0.0

            .. versionchanged:: 3.4.0
                Supports Spark Connect.

            Parameters
            ----------
            key : str, optional
                a key name string for configuration property
            value : str, optional
                a value for configuration property
            conf : :class:`SparkConf`, optional
                an instance of :class:`SparkConf`
            map: dictionary, optional
                a dictionary of configurations to set

                .. versionadded:: 3.4.0

            Returns
            -------
            :class:`SparkSession.Builder`

            See Also
            --------
            :class:`SparkConf`

            Examples
            --------
            For an existing :class:`SparkConf`, use `conf` parameter.

            >>> from pyspark.conf import SparkConf
            >>> conf = SparkConf().setAppName("example").setMaster("local")
            >>> SparkSession.builder.config(conf=conf)
            <pyspark.sql.session.SparkSession.Builder...

            For a (key, value) pair, you can omit parameter names.

            >>> SparkSession.builder.config("spark.some.config.option", "some-value")
            <pyspark.sql.session.SparkSession.Builder...

            Set multiple configurations.

            >>> SparkSession.builder.config(
            ...     "spark.some.config.number", 123).config("spark.some.config.float", 0.123)
            <pyspark.sql.session.SparkSession.Builder...

            Set multiple configurations using a dictionary.

            >>> SparkSession.builder.config(
            ...     map={"spark.some.config.number": 123, "spark.some.config.float": 0.123})
            <pyspark.sql.session.SparkSession.Builder...
            """
            with self._lock:
                if conf is not None:
                    for k, v in conf.getAll():
                        self._options[k] = v
                    self._validate_startup_urls()
                elif map is not None:
                    for k, v in map.items():  # type: ignore[assignment]
                        v = to_str(v)  # type: ignore[assignment]
                        self._options[k] = v
                    self._validate_startup_urls()
                else:
                    value = to_str(value)
                    self._options[cast(str, key)] = value
                    self._validate_startup_urls()
                return self

        def _validate_startup_urls(
            self,
        ) -> None:
            """
            Helper function that validates the combination of startup URLs and raises an exception
            if incompatible options are selected.
            """
            if ("spark.master" in self._options or "MASTER" in os.environ) and (
                "spark.remote" in self._options or "SPARK_REMOTE" in os.environ
            ):
                raise PySparkRuntimeError(
                    errorClass="CANNOT_CONFIGURE_SPARK_CONNECT_MASTER",
                    messageParameters={
                        "master_url": self._options.get("spark.master", os.environ.get("MASTER")),
                        "connect_url": self._options.get(
                            "spark.remote", os.environ.get("SPARK_REMOTE")
                        ),
                    },
                )

            if "spark.remote" in self._options:
                remote = cast(str, self._options.get("spark.remote"))
                if ("SPARK_REMOTE" in os.environ and os.environ["SPARK_REMOTE"] != remote) and (
                    "SPARK_LOCAL_REMOTE" in os.environ and not remote.startswith("local")
                ):
                    raise PySparkRuntimeError(
                        errorClass="CANNOT_CONFIGURE_SPARK_CONNECT",
                        messageParameters={
                            "existing_url": os.environ["SPARK_REMOTE"],
                            "new_url": remote,
                        },
                    )
| |
| <span class="k">def</span> <span class="nf">master</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">master</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SparkSession.Builder"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Sets the Spark master URL to connect to, such as "local" to run locally, "local[4]"</span> |
| <span class="sd"> to run locally with 4 cores, or "spark://master:7077" to run on a Spark standalone</span> |
| <span class="sd"> cluster.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> master : str</span> |
| <span class="sd"> a url for spark master</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`SparkSession.Builder`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> SparkSession.builder.master("local")</span> |
| <span class="sd"> <pyspark.sql.session.SparkSession.Builder...</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="s2">"spark.master"</span><span class="p">,</span> <span class="n">master</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">remote</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">url</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SparkSession.Builder"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Sets the Spark remote URL to connect to, such as "sc://host:port" to run</span> |
| <span class="sd"> it via Spark Connect server.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> url : str</span> |
| <span class="sd"> URL to Spark Connect server</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`SparkSession.Builder`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> SparkSession.builder.remote("sc://localhost") # doctest: +SKIP</span> |
| <span class="sd"> <pyspark.sql.session.SparkSession.Builder...</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="s2">"spark.remote"</span><span class="p">,</span> <span class="n">url</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">appName</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SparkSession.Builder"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Sets a name for the application, which will be shown in the Spark web UI.</span> |
| |
| <span class="sd"> If no application name is set, a randomly generated name will be used.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> name : str</span> |
| <span class="sd"> an application name</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`SparkSession.Builder`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> SparkSession.builder.appName("My app")</span> |
| <span class="sd"> <pyspark.sql.session.SparkSession.Builder...</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="s2">"spark.app.name"</span><span class="p">,</span> <span class="n">name</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">enableHiveSupport</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SparkSession.Builder"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Enables Hive support, including connectivity to a persistent Hive metastore, support</span> |
| <span class="sd"> for Hive SerDes, and Hive user-defined functions.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`SparkSession.Builder`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> SparkSession.builder.enableHiveSupport()</span> |
| <span class="sd"> <pyspark.sql.session.SparkSession.Builder...</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="s2">"spark.sql.catalogImplementation"</span><span class="p">,</span> <span class="s2">"hive"</span><span class="p">)</span> |
| |
| <span class="k">def</span> <span class="nf">getOrCreate</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SparkSession"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Gets an existing :class:`SparkSession` or, if there is no existing one, creates a</span> |
| <span class="sd"> new one based on the options set in this builder.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`SparkSession`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> This method first checks whether there is a valid global default SparkSession, and if</span> |
| <span class="sd"> yes, return that one. If no valid global default SparkSession exists, the method</span> |
| <span class="sd"> creates a new SparkSession and assigns the newly created SparkSession as the global</span> |
| <span class="sd"> default.</span> |
| |
| <span class="sd"> >>> s1 = SparkSession.builder.config("k1", "v1").getOrCreate()</span> |
| <span class="sd"> >>> s1.conf.get("k1") == "v1"</span> |
| <span class="sd"> True</span> |
| |
| <span class="sd"> The configuration of the SparkSession can be changed afterwards</span> |
| |
| <span class="sd"> >>> s1.conf.set("k1", "v1_new")</span> |
| <span class="sd"> >>> s1.conf.get("k1") == "v1_new"</span> |
| <span class="sd"> True</span> |
| |
| <span class="sd"> In case an existing SparkSession is returned, the config options specified</span> |
| <span class="sd"> in this builder will be applied to the existing SparkSession.</span> |
| |
| <span class="sd"> >>> s2 = SparkSession.builder.config("k2", "v2").getOrCreate()</span> |
| <span class="sd"> >>> s1.conf.get("k1") == s2.conf.get("k1") == "v1_new"</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> s1.conf.get("k2") == s2.conf.get("k2") == "v2"</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="n">opts</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">is_remote_only</span><span class="p">():</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.connect.session</span> <span class="kn">import</span> <span class="n">SparkSession</span> <span class="k">as</span> <span class="n">RemoteSparkSession</span> |
| |
| <span class="n">url</span> <span class="o">=</span> <span class="n">opts</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"spark.remote"</span><span class="p">,</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"SPARK_REMOTE"</span><span class="p">))</span> |
| |
| <span class="k">if</span> <span class="n">url</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"CONNECT_URL_NOT_SET"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span> |
| <span class="p">)</span> |
| |
| <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">"SPARK_CONNECT_MODE_ENABLED"</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"1"</span> |
| <span class="n">opts</span><span class="p">[</span><span class="s2">"spark.remote"</span><span class="p">]</span> <span class="o">=</span> <span class="n">url</span> |
| <span class="k">return</span> <span class="n">RemoteSparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="nb">map</span><span class="o">=</span><span class="n">opts</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> <span class="c1"># type: ignore</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark.core.context</span> <span class="kn">import</span> <span class="n">SparkContext</span> |
| |
| <span class="k">with</span> <span class="bp">self</span><span class="o">.</span><span class="n">_lock</span><span class="p">:</span> |
| <span class="k">if</span> <span class="p">(</span> |
| <span class="s2">"SPARK_CONNECT_MODE_ENABLED"</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span> |
| <span class="ow">or</span> <span class="s2">"SPARK_REMOTE"</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span> |
| <span class="ow">or</span> <span class="s2">"spark.remote"</span> <span class="ow">in</span> <span class="n">opts</span> |
| <span class="p">):</span> |
| <span class="k">with</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_lock</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.connect.session</span> <span class="kn">import</span> <span class="n">SparkSession</span> <span class="k">as</span> <span class="n">RemoteSparkSession</span> |
| |
| <span class="k">if</span> <span class="p">(</span> |
| <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="ow">and</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">_instantiatedSession</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="p">):</span> |
| <span class="n">url</span> <span class="o">=</span> <span class="n">opts</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"spark.remote"</span><span class="p">,</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"SPARK_REMOTE"</span><span class="p">))</span> |
| |
| <span class="k">if</span> <span class="n">url</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"CONNECT_URL_NOT_SET"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">url</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">"local"</span><span class="p">):</span> |
| <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">"SPARK_LOCAL_REMOTE"</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"1"</span> |
| <span class="n">RemoteSparkSession</span><span class="o">.</span><span class="n">_start_connect_server</span><span class="p">(</span><span class="n">url</span><span class="p">,</span> <span class="n">opts</span><span class="p">)</span> |
| <span class="n">url</span> <span class="o">=</span> <span class="s2">"sc://localhost"</span> |
| |
| <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">"SPARK_CONNECT_MODE_ENABLED"</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"1"</span> |
| <span class="n">opts</span><span class="p">[</span><span class="s2">"spark.remote"</span><span class="p">]</span> <span class="o">=</span> <span class="n">url</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span> |
| <span class="n">SparkSession</span><span class="p">,</span> |
| <span class="n">RemoteSparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="nb">map</span><span class="o">=</span><span class="n">opts</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">(),</span> |
| <span class="p">)</span> |
| <span class="k">elif</span> <span class="s2">"SPARK_LOCAL_REMOTE"</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">:</span> |
| <span class="n">url</span> <span class="o">=</span> <span class="s2">"sc://localhost"</span> |
| <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">"SPARK_CONNECT_MODE_ENABLED"</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"1"</span> |
| <span class="n">opts</span><span class="p">[</span><span class="s2">"spark.remote"</span><span class="p">]</span> <span class="o">=</span> <span class="n">url</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span> |
| <span class="n">SparkSession</span><span class="p">,</span> |
| <span class="n">RemoteSparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="nb">map</span><span class="o">=</span><span class="n">opts</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">(),</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"SESSION_ALREADY_EXIST"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span> |
| <span class="p">)</span> |
| |
| <span class="n">session</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">_instantiatedSession</span> |
| <span class="k">if</span> <span class="n">session</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="n">session</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">_jsc</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">sparkConf</span> <span class="o">=</span> <span class="n">SparkConf</span><span class="p">()</span> |
| <span class="k">for</span> <span class="n">key</span><span class="p">,</span> <span class="n">value</span> <span class="ow">in</span> <span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="n">sparkConf</span><span class="o">.</span><span class="n">set</span><span class="p">(</span><span class="n">key</span><span class="p">,</span> <span class="n">value</span><span class="p">)</span> |
| <span class="c1"># This SparkContext may be an existing one.</span> |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">(</span><span class="n">sparkConf</span><span class="p">)</span> |
| <span class="c1"># Do not update `SparkConf` for existing `SparkContext`, as it's shared</span> |
| <span class="c1"># by all sessions.</span> |
| <span class="n">session</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">options</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="nb">getattr</span><span class="p">(</span> |
| <span class="nb">getattr</span><span class="p">(</span><span class="n">session</span><span class="o">.</span><span class="n">_jvm</span><span class="p">,</span> <span class="s2">"SparkSession$"</span><span class="p">),</span> <span class="s2">"MODULE$"</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">applyModifiableSettings</span><span class="p">(</span><span class="n">session</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">session</span> |
| |
| <span class="c1"># Spark Connect-specific API</span> |
| <span class="k">def</span> <span class="nf">create</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SparkSession"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Creates a new SparkSession. Can only be used in the context of Spark Connect</span> |
| <span class="sd"> and will throw an exception otherwise.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`SparkSession`</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This method will update the default and/or active session if they are not set.</span> |
| <span class="sd"> """</span> |
| <span class="n">opts</span> <span class="o">=</span> <span class="nb">dict</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_options</span><span class="p">)</span> |
| <span class="k">if</span> <span class="s2">"SPARK_REMOTE"</span> <span class="ow">in</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span> <span class="ow">or</span> <span class="s2">"spark.remote"</span> <span class="ow">in</span> <span class="n">opts</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.connect.session</span> <span class="kn">import</span> <span class="n">SparkSession</span> <span class="k">as</span> <span class="n">RemoteSparkSession</span> |
| |
| <span class="c1"># Validate that no incompatible configuration options are selected.</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_validate_startup_urls</span><span class="p">()</span> |
| |
| <span class="n">url</span> <span class="o">=</span> <span class="n">opts</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"spark.remote"</span><span class="p">,</span> <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"SPARK_REMOTE"</span><span class="p">))</span> |
| <span class="k">if</span> <span class="n">url</span><span class="o">.</span><span class="n">startswith</span><span class="p">(</span><span class="s2">"local"</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"UNSUPPORTED_LOCAL_CONNECTION_STRING"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># Mark this Spark Session as Spark Connect. This prevents that local PySpark is</span> |
| <span class="c1"># used in conjunction with Spark Connect mode.</span> |
| <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">"SPARK_CONNECT_MODE_ENABLED"</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"1"</span> |
| <span class="n">opts</span><span class="p">[</span><span class="s2">"spark.remote"</span><span class="p">]</span> <span class="o">=</span> <span class="n">url</span> |
| <span class="k">return</span> <span class="n">cast</span><span class="p">(</span><span class="n">SparkSession</span><span class="p">,</span> <span class="n">RemoteSparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="nb">map</span><span class="o">=</span><span class="n">opts</span><span class="p">)</span><span class="o">.</span><span class="n">create</span><span class="p">())</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"ONLY_SUPPORTED_WITH_SPARK_CONNECT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"SparkSession.builder.create"</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># TODO(SPARK-38912): Replace classproperty with @classmethod + @property once support for</span> |
| <span class="c1"># Python 3.8 is dropped.</span> |
| <span class="c1">#</span> |
| <span class="c1"># In Python 3.9, the @property decorator has been made compatible with the</span> |
| <span class="c1"># @classmethod decorator (https://docs.python.org/3.9/library/functions.html#classmethod)</span> |
| <span class="c1">#</span> |
| <span class="c1"># @classmethod + @property is also affected by a bug in Python's docstring which was backported</span> |
| <span class="c1"># to Python 3.9.6 (https://github.com/python/cpython/pull/28838)</span> |
| <span class="c1">#</span> |
| <span class="c1"># SPARK-47544: Explicitly declaring this as an identifier instead of a method.</span> |
| <span class="c1"># If changing, make sure this bug is not reintroduced.</span> |
| <span class="n">builder</span><span class="p">:</span> <span class="n">Builder</span> <span class="o">=</span> <span class="n">classproperty</span><span class="p">(</span><span class="k">lambda</span> <span class="bp">cls</span><span class="p">:</span> <span class="bp">cls</span><span class="o">.</span><span class="n">Builder</span><span class="p">())</span> <span class="c1"># type: ignore</span> |
| <span class="w"> </span><span class="sd">"""Creates a :class:`Builder` for constructing a :class:`SparkSession`.</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| <span class="sd"> """</span> |
| |
| <span class="n">_instantiatedSession</span><span class="p">:</span> <span class="n">ClassVar</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="s2">"SparkSession"</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">_activeSession</span><span class="p">:</span> <span class="n">ClassVar</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="s2">"SparkSession"</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">sparkContext</span><span class="p">:</span> <span class="s2">"SparkContext"</span><span class="p">,</span> |
| <span class="n">jsparkSession</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"JavaObject"</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">options</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">]</span> <span class="o">=</span> <span class="p">{},</span> |
| <span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span> <span class="o">=</span> <span class="n">sparkContext</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jsc</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">_jsc</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">_jvm</span> |
| |
| <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| |
| <span class="k">if</span> <span class="n">jsparkSession</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">if</span> <span class="p">(</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">getDefaultSession</span><span class="p">()</span><span class="o">.</span><span class="n">isDefined</span><span class="p">()</span> |
| <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">getDefaultSession</span><span class="p">()</span><span class="o">.</span><span class="n">get</span><span class="p">()</span><span class="o">.</span><span class="n">sparkContext</span><span class="p">()</span><span class="o">.</span><span class="n">isStopped</span><span class="p">()</span> |
| <span class="p">):</span> |
| <span class="n">jsparkSession</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">getDefaultSession</span><span class="p">()</span><span class="o">.</span><span class="n">get</span><span class="p">()</span> |
| <span class="nb">getattr</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="p">,</span> <span class="s2">"SparkSession$"</span><span class="p">),</span> <span class="s2">"MODULE$"</span><span class="p">)</span><span class="o">.</span><span class="n">applyModifiableSettings</span><span class="p">(</span> |
| <span class="n">jsparkSession</span><span class="p">,</span> <span class="n">options</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">jsparkSession</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsc</span><span class="o">.</span><span class="n">sc</span><span class="p">(),</span> <span class="n">options</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="nb">getattr</span><span class="p">(</span><span class="nb">getattr</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="p">,</span> <span class="s2">"SparkSession$"</span><span class="p">),</span> <span class="s2">"MODULE$"</span><span class="p">)</span><span class="o">.</span><span class="n">applyModifiableSettings</span><span class="p">(</span> |
| <span class="n">jsparkSession</span><span class="p">,</span> <span class="n">options</span> |
| <span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span> <span class="o">=</span> <span class="n">jsparkSession</span> |
| <span class="n">_monkey_patch_RDD</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| <span class="n">install_exception_handler</span><span class="p">()</span> |
| <span class="c1"># If we had an instantiated SparkSession attached with a SparkContext</span> |
| <span class="c1"># which is stopped now, we need to renew the instantiated SparkSession.</span> |
| <span class="c1"># Otherwise, we will use invalid SparkSession when we call Builder.getOrCreate.</span> |
| <span class="k">if</span> <span class="p">(</span> |
| <span class="n">SparkSession</span><span class="o">.</span><span class="n">_instantiatedSession</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="ow">or</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">_instantiatedSession</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">_jsc</span> <span class="ow">is</span> <span class="kc">None</span> |
| <span class="p">):</span> |
| <span class="n">SparkSession</span><span class="o">.</span><span class="n">_instantiatedSession</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="n">SparkSession</span><span class="o">.</span><span class="n">_activeSession</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">setDefaultSession</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="p">)</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">setActiveSession</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="p">)</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_profiler_collector</span> <span class="o">=</span> <span class="n">AccumulatorProfilerCollector</span><span class="p">()</span> |
| |
| <span class="k">def</span> <span class="nf">_repr_html_</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="k">return</span> <span class="s2">"""</span> |
| <span class="s2"> <div></span> |
| <span class="s2"> <p><b>SparkSession - </span><span class="si">{catalogImplementation}</span><span class="s2"></b></p></span> |
| <span class="s2"> </span><span class="si">{sc_HTML}</span> |
| <span class="s2"> </div></span> |
| <span class="s2"> """</span><span class="o">.</span><span class="n">format</span><span class="p">(</span> |
| <span class="n">catalogImplementation</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">conf</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"spark.sql.catalogImplementation"</span><span class="p">),</span> |
| <span class="n">sc_HTML</span><span class="o">=</span><span class="bp">self</span><span class="o">.</span><span class="n">sparkContext</span><span class="o">.</span><span class="n">_repr_html_</span><span class="p">(),</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">_jconf</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"JavaObject"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Accessor for the JVM SQL-specific configurations"""</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">sessionState</span><span class="p">()</span><span class="o">.</span><span class="n">conf</span><span class="p">()</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_remote_only</span><span class="p">():</span> |
| |
| <div class="viewcode-block" id="SparkSession.newSession"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.newSession.html#pyspark.sql.SparkSession.newSession">[docs]</a> <span class="k">def</span> <span class="nf">newSession</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SparkSession"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a new :class:`SparkSession` as new session, that has separate SQLConf,</span> |
| <span class="sd"> registered temporary views and UDFs, but shared :class:`SparkContext` and</span> |
| <span class="sd"> table cache.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`SparkSession`</span> |
| <span class="sd"> Spark session if an active session exists for the current thread</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.newSession()</span> |
| <span class="sd"> <...SparkSession object ...></span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="vm">__class__</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">newSession</span><span class="p">())</span></div> |
| |
| <div class="viewcode-block" id="SparkSession.getActiveSession"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.getActiveSession.html#pyspark.sql.SparkSession.getActiveSession">[docs]</a> <span class="nd">@classmethod</span> |
| <span class="nd">@try_remote_session_classmethod</span> |
| <span class="k">def</span> <span class="nf">getActiveSession</span><span class="p">(</span><span class="bp">cls</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"SparkSession"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the active :class:`SparkSession` for the current thread, returned by the builder</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.5.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`SparkSession`</span> |
| <span class="sd"> Spark session if an active session exists for the current thread</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> s = SparkSession.getActiveSession()</span> |
| <span class="sd"> >>> df = s.createDataFrame([('Alice', 1)], ['name', 'age'])</span> |
| <span class="sd"> >>> df.select("age").show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> |age|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">SparkContext</span> |
| |
| <span class="n">sc</span> <span class="o">=</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_active_spark_context</span> |
| <span class="k">if</span> <span class="n">sc</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">return</span> <span class="kc">None</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">assert</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="k">if</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">getActiveSession</span><span class="p">()</span><span class="o">.</span><span class="n">isDefined</span><span class="p">():</span> |
| <span class="n">SparkSession</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="n">sc</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">getActiveSession</span><span class="p">()</span><span class="o">.</span><span class="n">get</span><span class="p">())</span> |
| <span class="k">return</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">_activeSession</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="kc">None</span></div> |
| |
| <div class="viewcode-block" id="SparkSession.active"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.active.html#pyspark.sql.SparkSession.active">[docs]</a> <span class="nd">@classmethod</span> |
| <span class="nd">@try_remote_session_classmethod</span> |
| <span class="k">def</span> <span class="nf">active</span><span class="p">(</span><span class="bp">cls</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SparkSession"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the active or default :class:`SparkSession` for the current thread, returned by</span> |
| <span class="sd"> the builder.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`SparkSession`</span> |
| <span class="sd"> Spark session if an active or default session exists for the current thread.</span> |
| <span class="sd"> """</span> |
| <span class="n">session</span> <span class="o">=</span> <span class="bp">cls</span><span class="o">.</span><span class="n">getActiveSession</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">session</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">session</span> <span class="o">=</span> <span class="bp">cls</span><span class="o">.</span><span class="n">_instantiatedSession</span> |
| <span class="k">if</span> <span class="n">session</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"NO_ACTIVE_OR_DEFAULT_SESSION"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">session</span></div> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_remote_only</span><span class="p">():</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">sparkContext</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SparkContext"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the underlying :class:`SparkContext`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`SparkContext`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.sparkContext</span> |
| <span class="sd"> <SparkContext master=... appName=...></span> |
| |
| <span class="sd"> Create an RDD from the Spark context</span> |
| |
| <span class="sd"> >>> rdd = spark.sparkContext.parallelize([1, 2, 3])</span> |
| <span class="sd"> >>> rdd.collect()</span> |
| <span class="sd"> [1, 2, 3]</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">version</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> The version of Spark on which this application is running.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> str</span> |
| <span class="sd"> the version of Spark in string.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> _ = spark.version</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">version</span><span class="p">()</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">conf</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">RuntimeConfig</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Runtime configuration interface for Spark.</span> |
| |
| <span class="sd"> This is the interface through which the user can get and set all Spark and Hadoop</span> |
| <span class="sd"> configurations that are relevant to Spark SQL. When getting the value of a config,</span> |
| <span class="sd"> this defaults to the value set in the underlying :class:`SparkContext`, if any.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`pyspark.sql.conf.RuntimeConfig`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.conf</span> |
| <span class="sd"> <pyspark...RuntimeConf...></span> |
| |
| <span class="sd"> Set a runtime configuration for the session</span> |
| |
| <span class="sd"> >>> spark.conf.set("key", "value")</span> |
| <span class="sd"> >>> spark.conf.get("key")</span> |
| <span class="sd"> 'value'</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">"_conf"</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_conf</span> <span class="o">=</span> <span class="n">RuntimeConfig</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">conf</span><span class="p">())</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_conf</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">catalog</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"Catalog"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Interface through which the user may create, drop, alter or query underlying</span> |
| <span class="sd"> databases, tables, functions, etc.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`Catalog`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.catalog</span> |
| <span class="sd"> <...Catalog object ...></span> |
| |
| <span class="sd"> Create a temp view, show the list, and drop it.</span> |
| |
| <span class="sd"> >>> spark.range(1).createTempView("test_view")</span> |
| <span class="sd"> >>> spark.catalog.listTables() # doctest: +SKIP</span> |
| <span class="sd"> [Table(name='test_view', catalog=None, namespace=[], description=None, ...</span> |
| <span class="sd"> >>> _ = spark.catalog.dropTempView("test_view")</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.catalog</span> <span class="kn">import</span> <span class="n">Catalog</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">"_catalog"</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_catalog</span> <span class="o">=</span> <span class="n">Catalog</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_catalog</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">udf</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"UDFRegistration"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a :class:`UDFRegistration` for UDF registration.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`UDFRegistration`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Register a Python UDF, and use it in SQL.</span> |
| |
| <span class="sd"> >>> strlen = spark.udf.register("strlen", lambda x: len(x))</span> |
| <span class="sd"> >>> spark.sql("SELECT strlen('test')").show()</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> |strlen(test)|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> +------------+</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.udf</span> <span class="kn">import</span> <span class="n">UDFRegistration</span> |
| |
| <span class="k">return</span> <span class="n">UDFRegistration</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">udtf</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"UDTFRegistration"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a :class:`UDTFRegistration` for UDTF registration.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`UDTFRegistration`</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.udtf</span> <span class="kn">import</span> <span class="n">UDTFRegistration</span> |
| |
| <span class="k">return</span> <span class="n">UDTFRegistration</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">dataSource</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataSourceRegistration"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a :class:`DataSourceRegistration` for data source registration.</span> |
| |
| <span class="sd"> .. versionadded:: 4.0.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataSourceRegistration`</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This feature is experimental and unstable.</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.datasource</span> <span class="kn">import</span> <span class="n">DataSourceRegistration</span> |
| |
| <span class="k">return</span> <span class="n">DataSourceRegistration</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">profile</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Profile</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a :class:`Profile` for performance/memory profiling.</span> |
| |
| <span class="sd"> .. versionadded:: 4.0.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`Profile`</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">Profile</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_profiler_collector</span><span class="p">)</span> |
| |
| <div class="viewcode-block" id="SparkSession.range"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.range.html#pyspark.sql.SparkSession.range">[docs]</a> <span class="k">def</span> <span class="nf">range</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">start</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> |
| <span class="n">end</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">step</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">1</span><span class="p">,</span> |
| <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Create a :class:`DataFrame` with single :class:`pyspark.sql.types.LongType` column named</span> |
| <span class="sd"> ``id``, containing elements in a range from ``start`` to ``end`` (exclusive) with</span> |
| <span class="sd"> step value ``step``.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> start : int</span> |
| <span class="sd"> the start value</span> |
| <span class="sd"> end : int, optional</span> |
| <span class="sd"> the end value (exclusive)</span> |
| <span class="sd"> step : int, optional</span> |
| <span class="sd"> the incremental step (default: 1)</span> |
| <span class="sd"> numPartitions : int, optional</span> |
| <span class="sd"> the number of partitions of the DataFrame</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.range(1, 7, 2).show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | id|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> | 5|</span> |
| <span class="sd"> +---+</span> |
| |
| <span class="sd"> If only one argument is specified, it will be used as the end value.</span> |
| |
| <span class="sd"> >>> spark.range(3).show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | id|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">numPartitions</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">numPartitions</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">defaultParallelism</span> |
| |
| <span class="k">if</span> <span class="n">end</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="nb">int</span><span class="p">(</span><span class="n">start</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">step</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">))</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">range</span><span class="p">(</span><span class="nb">int</span><span class="p">(</span><span class="n">start</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">end</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">step</span><span class="p">),</span> <span class="nb">int</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">))</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_inferSchemaFromList</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">data</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Any</span><span class="p">],</span> <span class="n">names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">StructType</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Infer schema from list of Row, dict, or tuple.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> data : iterable</span> |
| <span class="sd"> list of Row, dict, or tuple</span> |
| <span class="sd"> names : list, optional</span> |
| <span class="sd"> list of column names</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`pyspark.sql.types.StructType`</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">data</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"CANNOT_INFER_EMPTY_SCHEMA"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span> |
| <span class="p">)</span> |
| <span class="n">infer_dict_as_struct</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jconf</span><span class="o">.</span><span class="n">inferDictAsStruct</span><span class="p">()</span> |
| <span class="n">infer_array_from_first_element</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jconf</span><span class="o">.</span><span class="n">legacyInferArrayTypeFromFirstElement</span><span class="p">()</span> |
| <span class="n">infer_map_from_first_pair</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jconf</span><span class="o">.</span><span class="n">legacyInferMapStructTypeFromFirstItem</span><span class="p">()</span> |
| <span class="n">prefer_timestamp_ntz</span> <span class="o">=</span> <span class="n">is_timestamp_ntz_preferred</span><span class="p">()</span> |
| <span class="n">schema</span> <span class="o">=</span> <span class="n">reduce</span><span class="p">(</span> |
| <span class="n">_merge_type</span><span class="p">,</span> |
| <span class="p">(</span> |
| <span class="n">_infer_schema</span><span class="p">(</span> |
| <span class="n">row</span><span class="p">,</span> |
| <span class="n">names</span><span class="p">,</span> |
| <span class="n">infer_dict_as_struct</span><span class="o">=</span><span class="n">infer_dict_as_struct</span><span class="p">,</span> |
| <span class="n">infer_array_from_first_element</span><span class="o">=</span><span class="n">infer_array_from_first_element</span><span class="p">,</span> |
| <span class="n">infer_map_from_first_pair</span><span class="o">=</span><span class="n">infer_map_from_first_pair</span><span class="p">,</span> |
| <span class="n">prefer_timestamp_ntz</span><span class="o">=</span><span class="n">prefer_timestamp_ntz</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="n">data</span> |
| <span class="p">),</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">_has_nulltype</span><span class="p">(</span><span class="n">schema</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"CANNOT_DETERMINE_TYPE"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">schema</span> |
| |
| <span class="k">def</span> <span class="nf">_inferSchema</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">rdd</span><span class="p">:</span> <span class="s2">"RDD[Any]"</span><span class="p">,</span> |
| <span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">names</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">StructType</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Infer schema from an RDD of Row, dict, or tuple.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> rdd : :class:`RDD`</span> |
| <span class="sd"> an RDD of Row, dict, or tuple</span> |
| <span class="sd"> samplingRatio : float, optional</span> |
| <span class="sd"> sampling ratio, or no sampling (default)</span> |
| <span class="sd"> names : list, optional</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`pyspark.sql.types.StructType`</span> |
| <span class="sd"> """</span> |
| <span class="n">first</span> <span class="o">=</span> <span class="n">rdd</span><span class="o">.</span><span class="n">first</span><span class="p">()</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">first</span><span class="p">,</span> <span class="n">Sized</span><span class="p">)</span> <span class="ow">and</span> <span class="nb">len</span><span class="p">(</span><span class="n">first</span><span class="p">)</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"CANNOT_INFER_EMPTY_SCHEMA"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span> |
| <span class="p">)</span> |
| |
| <span class="n">infer_dict_as_struct</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jconf</span><span class="o">.</span><span class="n">inferDictAsStruct</span><span class="p">()</span> |
| <span class="n">infer_array_from_first_element</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jconf</span><span class="o">.</span><span class="n">legacyInferArrayTypeFromFirstElement</span><span class="p">()</span> |
| <span class="n">infer_map_from_first_pair</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jconf</span><span class="o">.</span><span class="n">legacyInferMapStructTypeFromFirstItem</span><span class="p">()</span> |
| <span class="n">prefer_timestamp_ntz</span> <span class="o">=</span> <span class="n">is_timestamp_ntz_preferred</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">samplingRatio</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">schema</span> <span class="o">=</span> <span class="n">_infer_schema</span><span class="p">(</span> |
| <span class="n">first</span><span class="p">,</span> |
| <span class="n">names</span><span class="o">=</span><span class="n">names</span><span class="p">,</span> |
| <span class="n">infer_dict_as_struct</span><span class="o">=</span><span class="n">infer_dict_as_struct</span><span class="p">,</span> |
| <span class="n">prefer_timestamp_ntz</span><span class="o">=</span><span class="n">prefer_timestamp_ntz</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">_has_nulltype</span><span class="p">(</span><span class="n">schema</span><span class="p">):</span> |
| <span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="n">rdd</span><span class="o">.</span><span class="n">take</span><span class="p">(</span><span class="mi">100</span><span class="p">)[</span><span class="mi">1</span><span class="p">:]:</span> |
| <span class="n">schema</span> <span class="o">=</span> <span class="n">_merge_type</span><span class="p">(</span> |
| <span class="n">schema</span><span class="p">,</span> |
| <span class="n">_infer_schema</span><span class="p">(</span> |
| <span class="n">row</span><span class="p">,</span> |
| <span class="n">names</span><span class="o">=</span><span class="n">names</span><span class="p">,</span> |
| <span class="n">infer_dict_as_struct</span><span class="o">=</span><span class="n">infer_dict_as_struct</span><span class="p">,</span> |
| <span class="n">infer_array_from_first_element</span><span class="o">=</span><span class="n">infer_array_from_first_element</span><span class="p">,</span> |
| <span class="n">infer_map_from_first_pair</span><span class="o">=</span><span class="n">infer_map_from_first_pair</span><span class="p">,</span> |
| <span class="n">prefer_timestamp_ntz</span><span class="o">=</span><span class="n">prefer_timestamp_ntz</span><span class="p">,</span> |
| <span class="p">),</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">_has_nulltype</span><span class="p">(</span><span class="n">schema</span><span class="p">):</span> |
| <span class="k">break</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"CANNOT_DETERMINE_TYPE"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{},</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">if</span> <span class="n">samplingRatio</span> <span class="o"><</span> <span class="mf">0.99</span><span class="p">:</span> |
| <span class="n">rdd</span> <span class="o">=</span> <span class="n">rdd</span><span class="o">.</span><span class="n">sample</span><span class="p">(</span><span class="kc">False</span><span class="p">,</span> <span class="nb">float</span><span class="p">(</span><span class="n">samplingRatio</span><span class="p">))</span> |
| <span class="n">schema</span> <span class="o">=</span> <span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span> |
| <span class="k">lambda</span> <span class="n">row</span><span class="p">:</span> <span class="n">_infer_schema</span><span class="p">(</span> |
| <span class="n">row</span><span class="p">,</span> |
| <span class="n">names</span><span class="p">,</span> |
| <span class="n">infer_dict_as_struct</span><span class="o">=</span><span class="n">infer_dict_as_struct</span><span class="p">,</span> |
| <span class="n">infer_array_from_first_element</span><span class="o">=</span><span class="n">infer_array_from_first_element</span><span class="p">,</span> |
| <span class="n">infer_map_from_first_pair</span><span class="o">=</span><span class="n">infer_map_from_first_pair</span><span class="p">,</span> |
| <span class="n">prefer_timestamp_ntz</span><span class="o">=</span><span class="n">prefer_timestamp_ntz</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="p">)</span><span class="o">.</span><span class="n">reduce</span><span class="p">(</span><span class="n">_merge_type</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">schema</span> |
| |
| <span class="k">def</span> <span class="nf">_createFromRDD</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">rdd</span><span class="p">:</span> <span class="s2">"RDD[Any]"</span><span class="p">,</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">DataType</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]],</span> |
| <span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"RDD[Tuple]"</span><span class="p">,</span> <span class="n">StructType</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Create an RDD for DataFrame from an existing RDD, returns the RDD and schema.</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="n">schema</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="n">struct</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_inferSchema</span><span class="p">(</span><span class="n">rdd</span><span class="p">,</span> <span class="n">samplingRatio</span><span class="p">,</span> <span class="n">names</span><span class="o">=</span><span class="n">schema</span><span class="p">)</span> |
| <span class="n">converter</span> <span class="o">=</span> <span class="n">_create_converter</span><span class="p">(</span><span class="n">struct</span><span class="p">)</span> |
| <span class="n">tupled_rdd</span> <span class="o">=</span> <span class="n">rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">converter</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">schema</span><span class="p">):</span> |
| <span class="n">struct</span><span class="o">.</span><span class="n">fields</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span> |
| <span class="n">struct</span><span class="o">.</span><span class="n">names</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">name</span> |
| |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">StructType</span><span class="p">):</span> |
| <span class="n">struct</span> <span class="o">=</span> <span class="n">schema</span> |
| <span class="n">tupled_rdd</span> <span class="o">=</span> <span class="n">rdd</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"NOT_LIST_OR_NONE_OR_STRUCT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span> |
| <span class="s2">"arg_name"</span><span class="p">:</span> <span class="s2">"schema"</span><span class="p">,</span> |
| <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">,</span> |
| <span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># convert python objects to sql data</span> |
| <span class="n">internal_rdd</span> <span class="o">=</span> <span class="n">tupled_rdd</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">struct</span><span class="o">.</span><span class="n">toInternal</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">internal_rdd</span><span class="p">,</span> <span class="n">struct</span> |
| |
| <span class="k">def</span> <span class="nf">_createFromLocal</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">data</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Any</span><span class="p">],</span> <span class="n">schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">DataType</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"RDD[Tuple]"</span><span class="p">,</span> <span class="n">StructType</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Create an RDD for DataFrame from a list or pandas.DataFrame, returns</span> |
| <span class="sd"> the RDD and schema.</span> |
| <span class="sd"> """</span> |
| <span class="c1"># make sure data could consumed multiple times</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="nb">list</span><span class="p">):</span> |
| <span class="n">data</span> <span class="o">=</span> <span class="nb">list</span><span class="p">(</span><span class="n">data</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">schema</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="n">struct</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_inferSchemaFromList</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">names</span><span class="o">=</span><span class="n">schema</span><span class="p">)</span> |
| <span class="n">converter</span> <span class="o">=</span> <span class="n">_create_converter</span><span class="p">(</span><span class="n">struct</span><span class="p">)</span> |
| <span class="n">tupled_data</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Tuple</span><span class="p">]</span> <span class="o">=</span> <span class="nb">map</span><span class="p">(</span><span class="n">converter</span><span class="p">,</span> <span class="n">data</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="k">for</span> <span class="n">i</span><span class="p">,</span> <span class="n">name</span> <span class="ow">in</span> <span class="nb">enumerate</span><span class="p">(</span><span class="n">schema</span><span class="p">):</span> |
| <span class="n">struct</span><span class="o">.</span><span class="n">fields</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">name</span> |
| <span class="n">struct</span><span class="o">.</span><span class="n">names</span><span class="p">[</span><span class="n">i</span><span class="p">]</span> <span class="o">=</span> <span class="n">name</span> |
| |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">StructType</span><span class="p">):</span> |
| <span class="n">struct</span> <span class="o">=</span> <span class="n">schema</span> |
| <span class="n">tupled_data</span> <span class="o">=</span> <span class="n">data</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"NOT_LIST_OR_NONE_OR_STRUCT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span> |
| <span class="s2">"arg_name"</span><span class="p">:</span> <span class="s2">"schema"</span><span class="p">,</span> |
| <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">,</span> |
| <span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="c1"># convert python objects to sql data</span> |
| <span class="n">internal_data</span> <span class="o">=</span> <span class="p">[</span><span class="n">struct</span><span class="o">.</span><span class="n">toInternal</span><span class="p">(</span><span class="n">row</span><span class="p">)</span> <span class="k">for</span> <span class="n">row</span> <span class="ow">in</span> <span class="n">tupled_data</span><span class="p">]</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">parallelize</span><span class="p">(</span><span class="n">internal_data</span><span class="p">),</span> <span class="n">struct</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_create_shell_session</span><span class="p">()</span> <span class="o">-></span> <span class="s2">"SparkSession"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Initialize a :class:`SparkSession` for a pyspark shell session. This is called from</span> |
| <span class="sd"> shell.py to make error handling simpler without needing to declare local variables in</span> |
| <span class="sd"> that script, which would expose those to users.</span> |
| <span class="sd"> """</span> |
| <span class="kn">import</span> <span class="nn">py4j</span> |
| <span class="kn">from</span> <span class="nn">pyspark.core.context</span> <span class="kn">import</span> <span class="n">SparkContext</span> |
| |
| <span class="k">try</span><span class="p">:</span> |
| <span class="c1"># Try to access HiveConf, it will raise exception if Hive is not added</span> |
| <span class="n">conf</span> <span class="o">=</span> <span class="n">SparkConf</span><span class="p">()</span> |
| <span class="k">assert</span> <span class="n">SparkContext</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="k">if</span> <span class="n">conf</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"spark.sql.catalogImplementation"</span><span class="p">,</span> <span class="s2">"hive"</span><span class="p">)</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="s2">"hive"</span><span class="p">:</span> |
| <span class="n">SparkContext</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">org</span><span class="o">.</span><span class="n">apache</span><span class="o">.</span><span class="n">hadoop</span><span class="o">.</span><span class="n">hive</span><span class="o">.</span><span class="n">conf</span><span class="o">.</span><span class="n">HiveConf</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">enableHiveSupport</span><span class="p">()</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">_getActiveSessionOrCreate</span><span class="p">()</span> |
| <span class="k">except</span> <span class="p">(</span><span class="n">py4j</span><span class="o">.</span><span class="n">protocol</span><span class="o">.</span><span class="n">Py4JError</span><span class="p">,</span> <span class="ne">TypeError</span><span class="p">):</span> |
| <span class="k">if</span> <span class="n">conf</span><span class="o">.</span><span class="n">get</span><span class="p">(</span><span class="s2">"spark.sql.catalogImplementation"</span><span class="p">,</span> <span class="s2">""</span><span class="p">)</span><span class="o">.</span><span class="n">lower</span><span class="p">()</span> <span class="o">==</span> <span class="s2">"hive"</span><span class="p">:</span> |
| <span class="n">warnings</span><span class="o">.</span><span class="n">warn</span><span class="p">(</span> |
| <span class="s2">"Fall back to non-hive support because failing to access HiveConf, "</span> |
| <span class="s2">"please make sure you build spark with hive"</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">_getActiveSessionOrCreate</span><span class="p">()</span> |
| |
| <span class="nd">@staticmethod</span> |
| <span class="k">def</span> <span class="nf">_getActiveSessionOrCreate</span><span class="p">(</span><span class="o">**</span><span class="n">static_conf</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SparkSession"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the active :class:`SparkSession` for the current thread, returned by the builder,</span> |
| <span class="sd"> or if there is no existing one, creates a new one based on the options set in the builder.</span> |
| |
| <span class="sd"> NOTE that 'static_conf' might not be set if there's an active or default Spark session</span> |
| <span class="sd"> running.</span> |
| <span class="sd"> """</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">getActiveSession</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">spark</span> <span class="ow">is</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">builder</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span> |
| <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="n">static_conf</span><span class="o">.</span><span class="n">items</span><span class="p">():</span> |
| <span class="n">builder</span> <span class="o">=</span> <span class="n">builder</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">k</span><span class="p">,</span> <span class="n">v</span><span class="p">)</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">builder</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="k">return</span> <span class="n">spark</span> |
| |
| <span class="nd">@overload</span> <span class="c1"># type: ignore[override]</span> |
| <span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">data</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="s2">"RowLike"</span><span class="p">],</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="o">...</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">data</span><span class="p">:</span> <span class="s2">"RDD[RowLike]"</span><span class="p">,</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="o">...</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">data</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="s2">"RowLike"</span><span class="p">],</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">data</span><span class="p">:</span> <span class="s2">"RDD[RowLike]"</span><span class="p">,</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">data</span><span class="p">:</span> <span class="s2">"RDD[AtomicValue]"</span><span class="p">,</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">AtomicType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">data</span><span class="p">:</span> <span class="n">Iterable</span><span class="p">[</span><span class="s2">"AtomicValue"</span><span class="p">],</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">AtomicType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">data</span><span class="p">:</span> <span class="s2">"PandasDataFrameLike"</span><span class="p">,</span> <span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">data</span><span class="p">:</span> <span class="s2">"pa.Table"</span><span class="p">,</span> <span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">data</span><span class="p">:</span> <span class="s2">"PandasDataFrameLike"</span><span class="p">,</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">data</span><span class="p">:</span> <span class="s2">"pa.Table"</span><span class="p">,</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="SparkSession.createDataFrame"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.createDataFrame.html#pyspark.sql.SparkSession.createDataFrame">[docs]</a> <span class="k">def</span> <span class="nf">createDataFrame</span><span class="p">(</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">data</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"RDD[Any]"</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Any</span><span class="p">],</span> <span class="s2">"PandasDataFrameLike"</span><span class="p">,</span> <span class="s2">"ArrayLike"</span><span class="p">,</span> <span class="s2">"pa.Table"</span><span class="p">],</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">AtomicType</span><span class="p">,</span> <span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Creates a :class:`DataFrame` from an :class:`RDD`, a list, a :class:`pandas.DataFrame`,</span> |
| <span class="sd"> a :class:`numpy.ndarray`, or a :class:`pyarrow.Table`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| <span class="sd"> Supports :class:`pyarrow.Table`.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> data : :class:`RDD` or iterable</span> |
| <span class="sd"> an RDD of any kind of SQL data representation (:class:`Row`,</span> |
| <span class="sd"> :class:`tuple`, ``int``, ``boolean``, ``dict``, etc.), or :class:`list`,</span> |
| <span class="sd"> :class:`pandas.DataFrame`, :class:`numpy.ndarray`, or :class:`pyarrow.Table`.</span> |
| <span class="sd"> schema : :class:`pyspark.sql.types.DataType`, str or list, optional</span> |
| <span class="sd"> a :class:`pyspark.sql.types.DataType` or a datatype string or a list of</span> |
| <span class="sd"> column names, default is None. The data type string format equals to</span> |
| <span class="sd"> :class:`pyspark.sql.types.DataType.simpleString`, except that top level struct type can</span> |
| <span class="sd"> omit the ``struct<>``.</span> |
| |
| <span class="sd"> When ``schema`` is a list of column names, the type of each column</span> |
| <span class="sd"> will be inferred from ``data``.</span> |
| |
| <span class="sd"> When ``schema`` is ``None``, it will try to infer the schema (column names and types)</span> |
| <span class="sd"> from ``data``, which should be an RDD of either :class:`Row`,</span> |
| <span class="sd"> :class:`namedtuple`, or :class:`dict`.</span> |
| |
| <span class="sd"> When ``schema`` is :class:`pyspark.sql.types.DataType` or a datatype string, it must</span> |
| <span class="sd"> match the real data, or an exception will be thrown at runtime. If the given schema is</span> |
| <span class="sd"> not :class:`pyspark.sql.types.StructType`, it will be wrapped into a</span> |
| <span class="sd"> :class:`pyspark.sql.types.StructType` as its only field, and the field name will be</span> |
| <span class="sd"> "value". Each record will also be wrapped into a tuple, which can be converted to row</span> |
| <span class="sd"> later.</span> |
| <span class="sd"> samplingRatio : float, optional</span> |
| <span class="sd"> the sample ratio of rows used for inferring. The first few rows will be used</span> |
| <span class="sd"> if ``samplingRatio`` is ``None``. This option is effective only when the input is</span> |
| <span class="sd"> :class:`RDD`.</span> |
| <span class="sd"> verifySchema : bool, optional</span> |
| <span class="sd"> verify data types of every row against schema. Enabled by default.</span> |
| <span class="sd"> When the input is :class:`pyarrow.Table` or when the input class is</span> |
| <span class="sd"> :class:`pandas.DataFrame` and `spark.sql.execution.arrow.pyspark.enabled` is enabled,</span> |
| <span class="sd"> this option is not effective. It follows Arrow type coercion. This option is not</span> |
| <span class="sd"> supported with Spark Connect.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Usage with `spark.sql.execution.arrow.pyspark.enabled=True` is experimental.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Create a DataFrame from a list of tuples.</span> |
| |
| <span class="sd"> >>> spark.createDataFrame([('Alice', 1)]).show()</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | _1| _2|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |Alice| 1|</span> |
| <span class="sd"> +-----+---+</span> |
| |
| <span class="sd"> Create a DataFrame from a list of dictionaries.</span> |
| |
| <span class="sd"> >>> d = [{'name': 'Alice', 'age': 1}]</span> |
| <span class="sd"> >>> spark.createDataFrame(d).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 1|Alice|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> Create a DataFrame with column names specified.</span> |
| |
| <span class="sd"> >>> spark.createDataFrame([('Alice', 1)], ['name', 'age']).show()</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | name|age|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |Alice| 1|</span> |
| <span class="sd"> +-----+---+</span> |
| |
| <span class="sd"> Create a DataFrame with the explicit schema specified.</span> |
| |
| <span class="sd"> >>> from pyspark.sql.types import *</span> |
| <span class="sd"> >>> schema = StructType([</span> |
| <span class="sd"> ... StructField("name", StringType(), True),</span> |
| <span class="sd"> ... StructField("age", IntegerType(), True)])</span> |
| <span class="sd"> >>> spark.createDataFrame([('Alice', 1)], schema).show()</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | name|age|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |Alice| 1|</span> |
| <span class="sd"> +-----+---+</span> |
| |
| <span class="sd"> Create a DataFrame with the schema in DDL formatted string.</span> |
| |
| <span class="sd"> >>> spark.createDataFrame([('Alice', 1)], "name: string, age: int").show()</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | name|age|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |Alice| 1|</span> |
| <span class="sd"> +-----+---+</span> |
| |
| <span class="sd"> Create an empty DataFrame.</span> |
| <span class="sd"> When initializing an empty DataFrame in PySpark, it's mandatory to specify its schema,</span> |
| <span class="sd"> as the DataFrame lacks data from which the schema can be inferred.</span> |
| |
| <span class="sd"> >>> spark.createDataFrame([], "name: string, age: int").show()</span> |
| <span class="sd"> +----+---+</span> |
| <span class="sd"> |name|age|</span> |
| <span class="sd"> +----+---+</span> |
| <span class="sd"> +----+---+</span> |
| |
| <span class="sd"> Create a DataFrame from Row objects.</span> |
| |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> Person = Row('name', 'age')</span> |
| <span class="sd"> >>> df = spark.createDataFrame([Person("Alice", 1)])</span> |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | name|age|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |Alice| 1|</span> |
| <span class="sd"> +-----+---+</span> |
| |
| <span class="sd"> Create a DataFrame from a pandas DataFrame.</span> |
| |
| <span class="sd"> >>> spark.createDataFrame(df.toPandas()).show() # doctest: +SKIP</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | name|age|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |Alice| 1|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> >>> spark.createDataFrame(pandas.DataFrame([[1, 2]])).collect() # doctest: +SKIP</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 0| 1|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 2|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> Create a DataFrame from a PyArrow Table.</span> |
| |
| <span class="sd"> >>> spark.createDataFrame(df.toArrow()).show() # doctest: +SKIP</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | name|age|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |Alice| 1|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> >>> table = pyarrow.table({'0': [1], '1': [2]}) # doctest: +SKIP</span> |
| <span class="sd"> >>> spark.createDataFrame(table).collect() # doctest: +SKIP</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 0| 1|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> """</span> |
| <span class="n">SparkSession</span><span class="o">.</span><span class="n">_activeSession</span> <span class="o">=</span> <span class="bp">self</span> |
| <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">setActiveSession</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"INVALID_TYPE"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"arg_name"</span><span class="p">:</span> <span class="s2">"data"</span><span class="p">,</span> <span class="s2">"arg_type"</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="n">schema</span> <span class="o">=</span> <span class="n">cast</span><span class="p">(</span><span class="n">Union</span><span class="p">[</span><span class="n">AtomicType</span><span class="p">,</span> <span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> <span class="n">_parse_datatype_string</span><span class="p">(</span><span class="n">schema</span><span class="p">))</span> |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="p">(</span><span class="nb">list</span><span class="p">,</span> <span class="nb">tuple</span><span class="p">)):</span> |
| <span class="c1"># Must re-encode any unicode strings to be consistent with StructField names</span> |
| <span class="n">schema</span> <span class="o">=</span> <span class="p">[</span><span class="n">x</span><span class="o">.</span><span class="n">encode</span><span class="p">(</span><span class="s2">"utf-8"</span><span class="p">)</span> <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="nb">str</span><span class="p">)</span> <span class="k">else</span> <span class="n">x</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">schema</span><span class="p">]</span> |
| |
| <span class="k">try</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span> |
| |
| <span class="n">has_pandas</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="k">except</span> <span class="ne">Exception</span><span class="p">:</span> |
| <span class="n">has_pandas</span> <span class="o">=</span> <span class="kc">False</span> |
| |
| <span class="k">try</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span> |
| |
| <span class="n">has_numpy</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="k">except</span> <span class="ne">Exception</span><span class="p">:</span> |
| <span class="n">has_numpy</span> <span class="o">=</span> <span class="kc">False</span> |
| |
| <span class="k">try</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span> |
| |
| <span class="n">has_pyarrow</span> <span class="o">=</span> <span class="kc">True</span> |
| <span class="k">except</span> <span class="ne">Exception</span><span class="p">:</span> |
| <span class="n">has_pyarrow</span> <span class="o">=</span> <span class="kc">False</span> |
| |
| <span class="k">if</span> <span class="n">has_numpy</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">np</span><span class="o">.</span><span class="n">ndarray</span><span class="p">):</span> |
| <span class="c1"># `data` of numpy.ndarray type will be converted to a pandas DataFrame,</span> |
| <span class="c1"># so pandas is required.</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.pandas.utils</span> <span class="kn">import</span> <span class="n">require_minimum_pandas_version</span> |
| |
| <span class="n">require_minimum_pandas_version</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">data</span><span class="o">.</span><span class="n">ndim</span> <span class="ow">not</span> <span class="ow">in</span> <span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">]:</span> |
| <span class="k">raise</span> <span class="n">PySparkValueError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"INVALID_NDARRAY_DIMENSION"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"dimensions"</span><span class="p">:</span> <span class="s2">"1 or 2"</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">data</span><span class="o">.</span><span class="n">ndim</span> <span class="o">==</span> <span class="mi">1</span> <span class="ow">or</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span> |
| <span class="n">column_names</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"value"</span><span class="p">]</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">column_names</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"_</span><span class="si">%s</span><span class="s2">"</span> <span class="o">%</span> <span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">data</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">+</span> <span class="mi">1</span><span class="p">)]</span> |
| |
| <span class="k">if</span> <span class="n">schema</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">and</span> <span class="ow">not</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jconf</span><span class="o">.</span><span class="n">arrowPySparkEnabled</span><span class="p">():</span> |
| <span class="c1"># Construct `schema` from `np.dtype` of the input NumPy array</span> |
| <span class="c1"># TODO: Apply the logic below when self._jconf.arrowPySparkEnabled() is True</span> |
| <span class="n">spark_type</span> <span class="o">=</span> <span class="n">_from_numpy_type</span><span class="p">(</span><span class="n">data</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">spark_type</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span><span class="p">:</span> |
| <span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">StructField</span><span class="p">(</span><span class="n">name</span><span class="p">,</span> <span class="n">spark_type</span><span class="p">,</span> <span class="n">nullable</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span> <span class="k">for</span> <span class="n">name</span> <span class="ow">in</span> <span class="n">column_names</span><span class="p">]</span> |
| <span class="p">)</span> |
| |
| <span class="n">data</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="n">column_names</span><span class="p">)</span> |
| |
| <span class="k">if</span> <span class="n">has_pandas</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">):</span> |
| <span class="c1"># Create a DataFrame from pandas DataFrame.</span> |
| <span class="k">return</span> <span class="nb">super</span><span class="p">(</span><span class="n">SparkSession</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> <span class="c1"># type: ignore[call-overload]</span> |
| <span class="n">data</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">samplingRatio</span><span class="p">,</span> <span class="n">verifySchema</span> |
| <span class="p">)</span> |
| <span class="k">if</span> <span class="n">has_pyarrow</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">pa</span><span class="o">.</span><span class="n">Table</span><span class="p">):</span> |
| <span class="c1"># Create a DataFrame from PyArrow Table.</span> |
| <span class="k">return</span> <span class="nb">super</span><span class="p">(</span><span class="n">SparkSession</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> <span class="c1"># type: ignore[call-overload]</span> |
| <span class="n">data</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">samplingRatio</span><span class="p">,</span> <span class="n">verifySchema</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_create_dataframe</span><span class="p">(</span> |
| <span class="n">data</span><span class="p">,</span> <span class="n">schema</span><span class="p">,</span> <span class="n">samplingRatio</span><span class="p">,</span> <span class="n">verifySchema</span> <span class="c1"># type: ignore[arg-type]</span> |
| <span class="p">)</span></div> |
| |
| <span class="k">def</span> <span class="nf">_create_dataframe</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">data</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"RDD[Any]"</span><span class="p">,</span> <span class="n">Iterable</span><span class="p">[</span><span class="n">Any</span><span class="p">]],</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">DataType</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]],</span> |
| <span class="n">samplingRatio</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> |
| <span class="n">verifySchema</span><span class="p">:</span> <span class="nb">bool</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">StructType</span><span class="p">):</span> |
| <span class="n">verify_func</span> <span class="o">=</span> <span class="n">_make_type_verifier</span><span class="p">(</span><span class="n">schema</span><span class="p">)</span> <span class="k">if</span> <span class="n">verifySchema</span> <span class="k">else</span> <span class="k">lambda</span> <span class="n">_</span><span class="p">:</span> <span class="kc">True</span> |
| |
| <span class="nd">@no_type_check</span> |
| <span class="k">def</span> <span class="nf">prepare</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span> |
| <span class="n">verify_func</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span> |
| <span class="k">return</span> <span class="n">obj</span> |
| |
| <span class="k">elif</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">schema</span><span class="p">,</span> <span class="n">DataType</span><span class="p">):</span> |
| <span class="n">dataType</span> <span class="o">=</span> <span class="n">schema</span> |
| <span class="n">schema</span> <span class="o">=</span> <span class="n">StructType</span><span class="p">()</span><span class="o">.</span><span class="n">add</span><span class="p">(</span><span class="s2">"value"</span><span class="p">,</span> <span class="n">schema</span><span class="p">)</span> |
| |
| <span class="n">verify_func</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">_make_type_verifier</span><span class="p">(</span><span class="n">dataType</span><span class="p">,</span> <span class="n">name</span><span class="o">=</span><span class="s2">"field value"</span><span class="p">)</span> |
| <span class="k">if</span> <span class="n">verifySchema</span> |
| <span class="k">else</span> <span class="k">lambda</span> <span class="n">_</span><span class="p">:</span> <span class="kc">True</span> |
| <span class="p">)</span> |
| |
| <span class="nd">@no_type_check</span> |
| <span class="k">def</span> <span class="nf">prepare</span><span class="p">(</span><span class="n">obj</span><span class="p">):</span> |
| <span class="n">verify_func</span><span class="p">(</span><span class="n">obj</span><span class="p">)</span> |
| <span class="k">return</span> <span class="p">(</span><span class="n">obj</span><span class="p">,)</span> |
| |
| <span class="k">else</span><span class="p">:</span> |
| |
| <span class="k">def</span> <span class="nf">prepare</span><span class="p">(</span><span class="n">obj</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="n">Any</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">obj</span> |
| |
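| <span class="c1"># Build an RDD of rows (each record passed through ``prepare`` for verification),</span>
| <span class="c1"># together with the final schema, then apply that schema on the JVM side.</span>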
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_remote_only</span><span class="p">():</span> |
| <span class="kn">from</span> <span class="nn">pyspark.core.rdd</span> <span class="kn">import</span> <span class="n">RDD</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_remote_only</span><span class="p">()</span> <span class="ow">and</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">data</span><span class="p">,</span> <span class="n">RDD</span><span class="p">):</span> |
| <span class="n">rdd</span><span class="p">,</span> <span class="n">struct</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_createFromRDD</span><span class="p">(</span><span class="n">data</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="n">prepare</span><span class="p">),</span> <span class="n">schema</span><span class="p">,</span> <span class="n">samplingRatio</span><span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="n">rdd</span><span class="p">,</span> <span class="n">struct</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_createFromLocal</span><span class="p">(</span> |
| <span class="nb">map</span><span class="p">(</span><span class="n">prepare</span><span class="p">,</span> <span class="n">data</span><span class="p">),</span> <span class="n">schema</span> <span class="c1"># type: ignore[arg-type]</span> |
| <span class="p">)</span> |
| <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="n">jrdd</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SerDeUtil</span><span class="o">.</span><span class="n">toJavaArray</span><span class="p">(</span><span class="n">rdd</span><span class="o">.</span><span class="n">_to_java_object_rdd</span><span class="p">())</span> |
| <span class="n">jdf</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">applySchemaToPythonRDD</span><span class="p">(</span><span class="n">jrdd</span><span class="o">.</span><span class="n">rdd</span><span class="p">(),</span> <span class="n">struct</span><span class="o">.</span><span class="n">json</span><span class="p">())</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="p">(</span><span class="n">jdf</span><span class="p">,</span> <span class="bp">self</span><span class="p">)</span> |
| <span class="n">df</span><span class="o">.</span><span class="n">_schema</span> <span class="o">=</span> <span class="n">struct</span> |
| <span class="k">return</span> <span class="n">df</span> |
| |
| <div class="viewcode-block" id="SparkSession.sql"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.sql.html#pyspark.sql.SparkSession.sql">[docs]</a> <span class="k">def</span> <span class="nf">sql</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">sqlQuery</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">args</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">],</span> <span class="n">List</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a :class:`DataFrame` representing the result of the given query.</span> |
| <span class="sd"> When ``kwargs`` is specified, this method formats the given string by using the Python</span> |
| <span class="sd"> standard formatter. The method binds named parameters to SQL literals or</span> |
| <span class="sd"> positional parameters from `args`. It doesn't support named and positional parameters</span> |
| <span class="sd"> in the same SQL query.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect and parameterized SQL.</span> |
| |
| <span class="sd"> .. versionchanged:: 3.5.0</span> |
| <span class="sd"> Added positional parameters.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> sqlQuery : str</span> |
| <span class="sd"> SQL query string.</span> |
| <span class="sd"> args : dict or list</span> |
| <span class="sd"> A dictionary of parameter names to Python objects or a list of Python objects</span> |
| <span class="sd"> that can be converted to SQL literal expressions. See</span> |
| <span class="sd"> `Supported Data Types`_ for supported value types in Python.</span> |
| <span class="sd"> For example, dictionary keys: "rank", "name", "birthdate";</span> |
| <span class="sd"> dictionary or list values: 1, "Steven", datetime.date(2023, 4, 2).</span> |
| <span class="sd"> A value can be also a `Column` of a literal or collection constructor functions such</span> |
| <span class="sd"> as `map()`, `array()`, `struct()`, in that case it is taken as is.</span> |
| |
| <span class="sd"> .. _Supported Data Types: https://spark.apache.org/docs/latest/sql-ref-datatypes.html</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> kwargs : dict</span> |
| <span class="sd"> Other variables that the user wants to set that can be referenced in the query</span> |
| |
| <span class="sd"> .. versionchanged:: 3.3.0</span> |
| <span class="sd"> Added optional argument ``kwargs`` to specify the mapping of variables in the query.</span> |
| <span class="sd"> This feature is experimental and unstable.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> In Spark Classic, a temporary view referenced in `spark.sql` is resolved immediately,</span> |
| <span class="sd"> while in Spark Connect it is lazily analyzed.</span> |
| <span class="sd"> So in Spark Connect if a view is dropped, modified or replaced after `spark.sql`, the</span> |
| <span class="sd"> execution may fail or generate different results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Executing a SQL query.</span> |
| |
| <span class="sd"> >>> spark.sql("SELECT * FROM range(10) where id > 7").show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | id|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 8|</span> |
| <span class="sd"> | 9|</span> |
| <span class="sd"> +---+</span> |
| |
| <span class="sd"> Executing a SQL query with variables as Python formatter standard.</span> |
| |
| <span class="sd"> >>> spark.sql(</span> |
| <span class="sd"> ... "SELECT * FROM range(10) WHERE id > {bound1} AND id < {bound2}", bound1=7, bound2=9</span> |
| <span class="sd"> ... ).show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | id|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 8|</span> |
| <span class="sd"> +---+</span> |
| |
| <span class="sd"> >>> mydf = spark.range(10)</span> |
| <span class="sd"> >>> spark.sql(</span> |
| <span class="sd"> ... "SELECT {col} FROM {mydf} WHERE id IN {x}",</span> |
| <span class="sd"> ... col=mydf.id, mydf=mydf, x=tuple(range(4))).show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | id|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> +---+</span> |
| |
| <span class="sd"> >>> spark.sql('''</span> |
| <span class="sd"> ... SELECT m1.a, m2.b</span> |
| <span class="sd"> ... FROM {table1} m1 INNER JOIN {table2} m2</span> |
| <span class="sd"> ... ON m1.key = m2.key</span> |
| <span class="sd"> ... ORDER BY m1.a, m2.b''',</span> |
| <span class="sd"> ... table1=spark.createDataFrame([(1, "a"), (2, "b")], ["a", "key"]),</span> |
| <span class="sd"> ... table2=spark.createDataFrame([(3, "a"), (4, "b"), (5, "b")], ["b", "key"])).show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | a| b|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 3|</span> |
| <span class="sd"> | 2| 4|</span> |
| <span class="sd"> | 2| 5|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> Also, it is possible to query using class:`Column` from :class:`DataFrame`.</span> |
| |
| <span class="sd"> >>> mydf = spark.createDataFrame([(1, 4), (2, 4), (3, 6)], ["A", "B"])</span> |
| <span class="sd"> >>> spark.sql("SELECT {df.A}, {df[B]} FROM {df}", df=mydf).show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | A| B|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 4|</span> |
| <span class="sd"> | 2| 4|</span> |
| <span class="sd"> | 3| 6|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> And substitute named parameters with the `:` prefix by SQL literals.</span> |
| |
| <span class="sd"> >>> from pyspark.sql.functions import create_map, lit</span> |
| <span class="sd"> >>> spark.sql(</span> |
| <span class="sd"> ... "SELECT *, element_at(:m, 'a') AS C FROM {df} WHERE {df[B]} > :minB",</span> |
| <span class="sd"> ... {"minB" : 5, "m" : create_map(lit('a'), lit(1))}, df=mydf).show()</span> |
| <span class="sd"> +---+---+---+</span> |
| <span class="sd"> | A| B| C|</span> |
| <span class="sd"> +---+---+---+</span> |
| <span class="sd"> | 3| 6| 1|</span> |
| <span class="sd"> +---+---+---+</span> |
| |
| <span class="sd"> Or positional parameters marked by `?` in the SQL query by SQL literals.</span> |
| |
| <span class="sd"> >>> from pyspark.sql.functions import array, lit</span> |
| <span class="sd"> >>> spark.sql(</span> |
| <span class="sd"> ... "SELECT *, element_at(?, 1) AS C FROM {df} WHERE {df[B]} > ? and ? < {df[A]}",</span> |
| <span class="sd"> ... args=[array(lit(1), lit(2), lit(3)), 5, 2], df=mydf).show()</span> |
| <span class="sd"> +---+---+---+</span> |
| <span class="sd"> | A| B| C|</span> |
| <span class="sd"> +---+---+---+</span> |
| <span class="sd"> | 3| 6| 1|</span> |
| <span class="sd"> +---+---+---+</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.classic.column</span> <span class="kn">import</span> <span class="n">_to_java_column</span> |
| |
| <span class="n">formatter</span> <span class="o">=</span> <span class="n">SQLStringFormatter</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">kwargs</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">sqlQuery</span> <span class="o">=</span> <span class="n">formatter</span><span class="o">.</span><span class="n">format</span><span class="p">(</span><span class="n">sqlQuery</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">)</span> |
| <span class="k">try</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">args</span><span class="p">,</span> <span class="n">Dict</span><span class="p">):</span> |
| <span class="n">litArgs</span> <span class="o">=</span> <span class="p">{</span><span class="n">k</span><span class="p">:</span> <span class="n">_to_java_column</span><span class="p">(</span><span class="n">lit</span><span class="p">(</span><span class="n">v</span><span class="p">))</span> <span class="k">for</span> <span class="n">k</span><span class="p">,</span> <span class="n">v</span> <span class="ow">in</span> <span class="p">(</span><span class="n">args</span> <span class="ow">or</span> <span class="p">{})</span><span class="o">.</span><span class="n">items</span><span class="p">()}</span> |
| <span class="k">elif</span> <span class="n">args</span> <span class="ow">is</span> <span class="kc">None</span> <span class="ow">or</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">args</span><span class="p">,</span> <span class="n">List</span><span class="p">):</span> |
| <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="n">litArgs</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">PythonUtils</span><span class="o">.</span><span class="n">toArray</span><span class="p">(</span> |
| <span class="p">[</span><span class="n">_to_java_column</span><span class="p">(</span><span class="n">lit</span><span class="p">(</span><span class="n">v</span><span class="p">))</span> <span class="k">for</span> <span class="n">v</span> <span class="ow">in</span> <span class="p">(</span><span class="n">args</span> <span class="ow">or</span> <span class="p">[])]</span> |
| <span class="p">)</span> |
| <span class="k">else</span><span class="p">:</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"INVALID_TYPE"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"arg_name"</span><span class="p">:</span> <span class="s2">"args"</span><span class="p">,</span> <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">args</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">sql</span><span class="p">(</span><span class="n">sqlQuery</span><span class="p">,</span> <span class="n">litArgs</span><span class="p">),</span> <span class="bp">self</span><span class="p">)</span> |
| <span class="k">finally</span><span class="p">:</span> |
| <span class="k">if</span> <span class="nb">len</span><span class="p">(</span><span class="n">kwargs</span><span class="p">)</span> <span class="o">></span> <span class="mi">0</span><span class="p">:</span> |
| <span class="n">formatter</span><span class="o">.</span><span class="n">clear</span><span class="p">()</span></div> |
| |
| <div class="viewcode-block" id="SparkSession.table"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.table.html#pyspark.sql.SparkSession.table">[docs]</a> <span class="k">def</span> <span class="nf">table</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tableName</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the specified table as a :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> tableName : str</span> |
| <span class="sd"> the table name to retrieve.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> In Spark Classic, a temporary view referenced in `spark.table` is resolved immediately,</span> |
| <span class="sd"> while in Spark Connect it is lazily analyzed.</span> |
| <span class="sd"> So in Spark Connect if a view is dropped, modified or replaced after `spark.table`, the</span> |
| <span class="sd"> execution may fail or generate different results.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.range(5).createOrReplaceTempView("table1")</span> |
| <span class="sd"> >>> spark.table("table1").sort("id").show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | id|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> """</span> |
| <span class="k">if</span> <span class="ow">not</span> <span class="nb">isinstance</span><span class="p">(</span><span class="n">tableName</span><span class="p">,</span> <span class="nb">str</span><span class="p">):</span> |
| <span class="k">raise</span> <span class="n">PySparkTypeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"NOT_STR"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"arg_name"</span><span class="p">:</span> <span class="s2">"tableName"</span><span class="p">,</span> <span class="s2">"arg_type"</span><span class="p">:</span> <span class="nb">type</span><span class="p">(</span><span class="n">tableName</span><span class="p">)</span><span class="o">.</span><span class="vm">__name__</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">table</span><span class="p">(</span><span class="n">tableName</span><span class="p">),</span> <span class="bp">self</span><span class="p">)</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">read</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrameReader</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a :class:`DataFrameReader` that can be used to read data</span> |
| <span class="sd"> in as a :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrameReader`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.read</span> |
| <span class="sd"> <...DataFrameReader object ...></span> |
| |
| <span class="sd"> Write a DataFrame into a JSON file and read it back.</span> |
| |
| <span class="sd"> >>> import tempfile</span> |
| <span class="sd"> >>> with tempfile.TemporaryDirectory(prefix="read") as d:</span> |
| <span class="sd"> ... # Write a DataFrame into a JSON file</span> |
| <span class="sd"> ... spark.createDataFrame(</span> |
| <span class="sd"> ... [{"age": 100, "name": "Hyukjin Kwon"}]</span> |
| <span class="sd"> ... ).write.mode("overwrite").format("json").save(d)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ... # Read the JSON file as a DataFrame.</span> |
| <span class="sd"> ... spark.read.format('json').load(d).show()</span> |
| <span class="sd"> +---+------------+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+------------+</span> |
| <span class="sd"> |100|Hyukjin Kwon|</span> |
| <span class="sd"> +---+------------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrameReader</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">readStream</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataStreamReader</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a :class:`DataStreamReader` that can be used to read data streams</span> |
| <span class="sd"> as a streaming :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.5.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is evolving.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataStreamReader`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.readStream</span> |
| <span class="sd"> <pyspark...DataStreamReader object ...></span> |
| |
| <span class="sd"> The example below uses Rate source that generates rows continuously.</span> |
| <span class="sd"> After that, we operate a modulo by 3, and then write the stream out to the console.</span> |
| <span class="sd"> The streaming query stops in 3 seconds.</span> |
| |
| <span class="sd"> >>> import time</span> |
| <span class="sd"> >>> df = spark.readStream.format("rate").load()</span> |
| <span class="sd"> >>> df = df.selectExpr("value % 3 as v")</span> |
| <span class="sd"> >>> q = df.writeStream.format("console").start()</span> |
| <span class="sd"> >>> time.sleep(3)</span> |
| <span class="sd"> >>> q.stop()</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataStreamReader</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">streams</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"StreamingQueryManager"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a :class:`StreamingQueryManager` that allows managing all the</span> |
| <span class="sd"> :class:`StreamingQuery` instances active on `this` context.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.5.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is evolving.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`StreamingQueryManager`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.streams</span> |
| <span class="sd"> <pyspark...StreamingQueryManager object ...></span> |
| |
| <span class="sd"> Get the list of active streaming queries</span> |
| |
| <span class="sd"> >>> sq = spark.readStream.format(</span> |
| <span class="sd"> ... "rate").load().writeStream.format('memory').queryName('this_query').start()</span> |
| <span class="sd"> >>> sqm = spark.streams</span> |
| <span class="sd"> >>> [q.name for q in sqm.active]</span> |
| <span class="sd"> ['this_query']</span> |
| <span class="sd"> >>> sq.stop()</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.streaming</span> <span class="kn">import</span> <span class="n">StreamingQueryManager</span> |
| |
| <span class="k">if</span> <span class="nb">hasattr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="s2">"_sqm"</span><span class="p">):</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sqm</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_sqm</span><span class="p">:</span> <span class="n">StreamingQueryManager</span> <span class="o">=</span> <span class="n">StreamingQueryManager</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jsparkSession</span><span class="o">.</span><span class="n">streams</span><span class="p">())</span> |
| <span class="k">return</span> <span class="bp">self</span><span class="o">.</span><span class="n">_sqm</span> |
| |
| <div class="viewcode-block" id="SparkSession.stop"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.stop.html#pyspark.sql.SparkSession.stop">[docs]</a> <span class="k">def</span> <span class="nf">stop</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Stop the underlying :class:`SparkContext`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.stop() # doctest: +SKIP</span> |
| <span class="sd"> """</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.context</span> <span class="kn">import</span> <span class="n">SQLContext</span> |
| |
| <span class="bp">self</span><span class="o">.</span><span class="n">_sc</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| <span class="c1"># We should clean the default session up. See SPARK-23228.</span> |
| <span class="k">assert</span> <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span> <span class="ow">is</span> <span class="ow">not</span> <span class="kc">None</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">clearDefaultSession</span><span class="p">()</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jvm</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">clearActiveSession</span><span class="p">()</span> |
| <span class="n">SparkSession</span><span class="o">.</span><span class="n">_instantiatedSession</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">SparkSession</span><span class="o">.</span><span class="n">_activeSession</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">SQLContext</span><span class="o">.</span><span class="n">_instantiatedContext</span> <span class="o">=</span> <span class="kc">None</span></div> |
| |
| <span class="k">def</span> <span class="fm">__enter__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SparkSession"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Enable 'with SparkSession.builder.(...).getOrCreate() as session: app' syntax.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> with SparkSession.builder.master("local").getOrCreate() as session:</span> |
| <span class="sd"> ... session.range(5).show() # doctest: +SKIP</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | id|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="bp">self</span> |
| |
| <span class="k">def</span> <span class="fm">__exit__</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">exc_type</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Type</span><span class="p">[</span><span class="ne">BaseException</span><span class="p">]],</span> |
| <span class="n">exc_val</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="ne">BaseException</span><span class="p">],</span> |
| <span class="n">exc_tb</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">TracebackType</span><span class="p">],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Enable 'with SparkSession.builder.(...).getOrCreate() as session: app' syntax.</span> |
| |
| <span class="sd"> Specifically stop the SparkSession on exit of the with block.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> with SparkSession.builder.master("local").getOrCreate() as session:</span> |
| <span class="sd"> ... session.range(5).show() # doctest: +SKIP</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | id|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| |
| <span class="c1"># SparkConnect-specific API</span> |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">client</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SparkConnectClient"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Gives access to the Spark Connect client. In normal cases this is not necessary to be used</span> |
| <span class="sd"> and only relevant for testing.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`SparkConnectClient`</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is unstable, and a developer API. It returns non-API instance</span> |
| <span class="sd"> :class:`SparkConnectClient`.</span> |
| <span class="sd"> This is an API dedicated to Spark Connect client only. With regular Spark Session, it throws</span> |
| <span class="sd"> an exception.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"ONLY_SUPPORTED_WITH_SPARK_CONNECT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"SparkSession.client"</span><span class="p">},</span> |
| <span class="p">)</span> |
| |
| <div class="viewcode-block" id="SparkSession.addArtifacts"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.addArtifacts.html#pyspark.sql.SparkSession.addArtifacts">[docs]</a> <span class="k">def</span> <span class="nf">addArtifacts</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">pyfile</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">archive</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> <span class="n">file</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Add artifact(s) to the client session. Currently only local files are supported.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> *path : tuple of str</span> |
| <span class="sd"> Artifact's URIs to add.</span> |
| <span class="sd"> pyfile : bool</span> |
| <span class="sd"> Whether to add them as Python dependencies such as .py, .egg, .zip or .jar files.</span> |
| <span class="sd"> The pyfiles are directly inserted into the path when executing Python functions</span> |
| <span class="sd"> in executors.</span> |
| <span class="sd"> archive : bool</span> |
| <span class="sd"> Whether to add them as archives such as .zip, .jar, .tar.gz, .tgz, or .tar files.</span> |
| <span class="sd"> The archives are unpacked on the executor side automatically.</span> |
| <span class="sd"> file : bool</span> |
| <span class="sd"> Add a file to be downloaded with this Spark job on every node.</span> |
| <span class="sd"> The ``path`` passed can only be a local file for now.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This is an API dedicated to Spark Connect client only. With regular Spark Session, it throws</span> |
| <span class="sd"> an exception.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"ONLY_SUPPORTED_WITH_SPARK_CONNECT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"SparkSession.addArtifact(s)"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <span class="n">addArtifact</span> <span class="o">=</span> <span class="n">addArtifacts</span> |
| |
| <div class="viewcode-block" id="SparkSession.registerProgressHandler"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.registerProgressHandler.html#pyspark.sql.SparkSession.registerProgressHandler">[docs]</a> <span class="k">def</span> <span class="nf">registerProgressHandler</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">handler</span><span class="p">:</span> <span class="s2">"ProgressHandler"</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Register a progress handler to be called when a progress update is received from the server.</span> |
| |
| <span class="sd"> .. versionadded:: 4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> handler : ProgressHandler</span> |
| <span class="sd"> A callable that follows the ProgressHandler interface. This handler will be called</span> |
| <span class="sd"> on every progress update.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| |
| <span class="sd"> >>> def progress_handler(stages, inflight_tasks, done):</span> |
| <span class="sd"> ... print(f"{len(stages)} Stages known, Done: {done}")</span> |
| <span class="sd"> >>> spark.registerProgressHandler(progress_handler)</span> |
| <span class="sd"> >>> res = spark.range(10).repartition(1).collect() # doctest: +SKIP</span> |
| <span class="sd"> 3 Stages known, Done: False</span> |
| <span class="sd"> 3 Stages known, Done: True</span> |
| <span class="sd"> >>> spark.clearProgressHandlers()</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"ONLY_SUPPORTED_WITH_SPARK_CONNECT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"SparkSession.registerProgressHandler"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="SparkSession.removeProgressHandler"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.removeProgressHandler.html#pyspark.sql.SparkSession.removeProgressHandler">[docs]</a> <span class="k">def</span> <span class="nf">removeProgressHandler</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">handler</span><span class="p">:</span> <span class="s2">"ProgressHandler"</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Remove a progress handler that was previously registered.</span> |
| |
| <span class="sd"> .. versionadded:: 4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> handler : ProgressHandler</span> |
| <span class="sd"> The handler to remove if present in the list of progress handlers.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"ONLY_SUPPORTED_WITH_SPARK_CONNECT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"SparkSession.removeProgressHandler"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="SparkSession.clearProgressHandlers"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.clearProgressHandlers.html#pyspark.sql.SparkSession.clearProgressHandlers">[docs]</a> <span class="k">def</span> <span class="nf">clearProgressHandlers</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Clear all registered progress handlers.</span> |
| |
| <span class="sd"> .. versionadded:: 4.0</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"ONLY_SUPPORTED_WITH_SPARK_CONNECT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"SparkSession.clearProgressHandlers"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="SparkSession.copyFromLocalToFs"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.copyFromLocalToFs.html#pyspark.sql.SparkSession.copyFromLocalToFs">[docs]</a> <span class="k">def</span> <span class="nf">copyFromLocalToFs</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">local_path</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">dest_path</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Copy file from local to cloud storage file system.</span> |
| <span class="sd"> If the file already exits in destination path, old file is overwritten.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> local_path: str</span> |
| <span class="sd"> Path to a local file. Directories are not supported.</span> |
| <span class="sd"> The path can be either an absolute path or a relative path.</span> |
| <span class="sd"> dest_path: str</span> |
| <span class="sd"> The cloud storage path to the destination the file will</span> |
| <span class="sd"> be copied to.</span> |
| <span class="sd"> The path must be an an absolute path.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is a developer API.</span> |
| <span class="sd"> Also, this is an API dedicated to Spark Connect client only. With regular</span> |
| <span class="sd"> Spark Session, it throws an exception.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"ONLY_SUPPORTED_WITH_SPARK_CONNECT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"SparkSession.copyFromLocalToFs"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="SparkSession.interruptAll"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.interruptAll.html#pyspark.sql.SparkSession.interruptAll">[docs]</a> <span class="k">def</span> <span class="nf">interruptAll</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Interrupt all operations of this session currently running on the connected server.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> list of str</span> |
| <span class="sd"> List of operationIds of interrupted operations.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> There is still a possibility of operation finishing just as it is interrupted.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"ONLY_SUPPORTED_WITH_SPARK_CONNECT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"SparkSession.interruptAll"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="SparkSession.interruptTag"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.interruptTag.html#pyspark.sql.SparkSession.interruptTag">[docs]</a> <span class="k">def</span> <span class="nf">interruptTag</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tag</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Interrupt all operations of this session with the given operation tag.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> list of str</span> |
| <span class="sd"> List of operationIds of interrupted operations.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> There is still a possibility of operation finishing just as it is interrupted.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"ONLY_SUPPORTED_WITH_SPARK_CONNECT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"SparkSession.interruptTag"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="SparkSession.interruptOperation"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.interruptOperation.html#pyspark.sql.SparkSession.interruptOperation">[docs]</a> <span class="k">def</span> <span class="nf">interruptOperation</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">op_id</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Interrupt an operation of this session with the given operationId.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> list of str</span> |
| <span class="sd"> List of operationIds of interrupted operations.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> There is still a possibility of operation finishing just as it is interrupted.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"ONLY_SUPPORTED_WITH_SPARK_CONNECT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"SparkSession.interruptOperation"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="SparkSession.addTag"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.addTag.html#pyspark.sql.SparkSession.addTag">[docs]</a> <span class="k">def</span> <span class="nf">addTag</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tag</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Add a tag to be assigned to all the operations started by this thread in this session.</span> |
| |
| <span class="sd"> Often, a unit of execution in an application consists of multiple Spark executions.</span> |
| <span class="sd"> Application programmers can use this method to group all those jobs together and give a</span> |
| <span class="sd"> group tag. The application can use :meth:`SparkSession.interruptTag` to cancel all running</span> |
| <span class="sd"> executions with this tag.</span> |
| |
| <span class="sd"> There may be multiple tags present at the same time, so different parts of application may</span> |
| <span class="sd"> use different tags to perform cancellation at different levels of granularity.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> tag : str</span> |
| <span class="sd"> The tag to be added. Cannot contain ',' (comma) character or be an empty string.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"ONLY_SUPPORTED_WITH_SPARK_CONNECT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"SparkSession.addTag"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="SparkSession.removeTag"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.removeTag.html#pyspark.sql.SparkSession.removeTag">[docs]</a> <span class="k">def</span> <span class="nf">removeTag</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">tag</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Remove a tag previously added to be assigned to all the operations started by this thread in</span> |
| <span class="sd"> this session. Noop if such a tag was not added earlier.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> tag : list of str</span> |
| <span class="sd"> The tag to be removed. Cannot contain ',' (comma) character or be an empty string.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"ONLY_SUPPORTED_WITH_SPARK_CONNECT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"SparkSession.removeTag"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="SparkSession.getTags"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.getTags.html#pyspark.sql.SparkSession.getTags">[docs]</a> <span class="k">def</span> <span class="nf">getTags</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Set</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Get the tags that are currently set to be assigned to all the operations started by this</span> |
| <span class="sd"> thread.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> set of str</span> |
| <span class="sd"> Set of tags of interrupted operations.</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"ONLY_SUPPORTED_WITH_SPARK_CONNECT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"SparkSession.getTags"</span><span class="p">},</span> |
| <span class="p">)</span></div> |
| |
| <div class="viewcode-block" id="SparkSession.clearTags"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.SparkSession.clearTags.html#pyspark.sql.SparkSession.clearTags">[docs]</a> <span class="k">def</span> <span class="nf">clearTags</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Clear the current thread's operation tags.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| <span class="sd"> """</span> |
| <span class="k">raise</span> <span class="n">PySparkRuntimeError</span><span class="p">(</span> |
| <span class="n">errorClass</span><span class="o">=</span><span class="s2">"ONLY_SUPPORTED_WITH_SPARK_CONNECT"</span><span class="p">,</span> |
| <span class="n">messageParameters</span><span class="o">=</span><span class="p">{</span><span class="s2">"feature"</span><span class="p">:</span> <span class="s2">"SparkSession.clearTags"</span><span class="p">},</span> |
| <span class="p">)</span></div></div> |
| |
| |
| <span class="k">def</span> <span class="nf">_test</span><span class="p">()</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="kn">import</span> <span class="nn">os</span> |
| <span class="kn">import</span> <span class="nn">doctest</span> |
| <span class="kn">import</span> <span class="nn">pyspark.sql.session</span> |
| |
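| <span class="c1"># Run the doctests from SPARK_HOME so that relative paths in examples resolve.</span> |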
| <span class="n">os</span><span class="o">.</span><span class="n">chdir</span><span class="p">(</span><span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s2">"SPARK_HOME"</span><span class="p">])</span> |
| |
| <span class="c1"># Disable Doc Tests for Spark Connect only functions:</span> |
| <span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">registerProgressHandler</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">removeProgressHandler</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">clearProgressHandlers</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="kc">None</span> |
| |
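| <span class="c1"># Copy the module globals and inject a shared local SparkSession for the doctests.</span> |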
| <span class="n">globs</span> <span class="o">=</span> <span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">session</span><span class="o">.</span><span class="vm">__dict__</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"spark"</span><span class="p">]</span> <span class="o">=</span> <span class="p">(</span> |
| <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">master</span><span class="p">(</span><span class="s2">"local[4]"</span><span class="p">)</span><span class="o">.</span><span class="n">appName</span><span class="p">(</span><span class="s2">"sql.session tests"</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="p">)</span> |
| <span class="p">(</span><span class="n">failure_count</span><span class="p">,</span> <span class="n">test_count</span><span class="p">)</span> <span class="o">=</span> <span class="n">doctest</span><span class="o">.</span><span class="n">testmod</span><span class="p">(</span> |
| <span class="n">pyspark</span><span class="o">.</span><span class="n">sql</span><span class="o">.</span><span class="n">session</span><span class="p">,</span> |
| <span class="n">globs</span><span class="o">=</span><span class="n">globs</span><span class="p">,</span> |
| <span class="n">optionflags</span><span class="o">=</span><span class="n">doctest</span><span class="o">.</span><span class="n">ELLIPSIS</span> <span class="o">|</span> <span class="n">doctest</span><span class="o">.</span><span class="n">NORMALIZE_WHITESPACE</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="n">globs</span><span class="p">[</span><span class="s2">"spark"</span><span class="p">]</span><span class="o">.</span><span class="n">stop</span><span class="p">()</span> |
| <span class="k">if</span> <span class="n">failure_count</span><span class="p">:</span> |
| <span class="n">sys</span><span class="o">.</span><span class="n">exit</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">)</span> |
| |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="n">_test</span><span class="p">()</span> |
| </pre></div> |
| |
| </article> |
| |
| |
| |
| <footer class="bd-footer-article"> |
| |
| <div class="footer-article-items footer-article__inner"> |
| |
| <div class="footer-article-item"><!-- Previous / next buttons --> |
| <div class="prev-next-area"> |
| </div></div> |
| |
| </div> |
| |
| </footer> |
| |
| </div> |
| |
| |
| |
| |
| </div> |
| <footer class="bd-footer-content"> |
| |
| </footer> |
| |
| </main> |
| </div> |
| </div> |
| |
| |
| <footer class="bd-footer"> |
| <div class="bd-footer__inner bd-page-width"> |
| |
| <div class="footer-items__start"> |
| |
| <div class="footer-item"><p class="copyright"> |
| Copyright @ 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. |
| </p></div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0. |
| <br/> |
| </p> |
| </div> |
| |
| </div> |
| |
| |
| <div class="footer-items__end"> |
| |
| <div class="footer-item"><p class="theme-version"> |
| Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3. |
| </p></div> |
| |
| </div> |
| |
| </div> |
| |
| </footer> |
| </body> |
| </html> |