| |
| |
| <!DOCTYPE html> |
| |
| |
| <html> |
| |
| <head> |
| <meta charset="utf-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /> |
| <title>pyspark.sql.dataframe — PySpark 4.0.0-preview1 documentation</title> |
| |
| |
| |
| <script data-cfasync="false"> |
| document.documentElement.dataset.mode = localStorage.getItem("mode") || ""; |
| document.documentElement.dataset.theme = localStorage.getItem("theme") || "light"; |
| </script> |
| |
| <!-- Loaded before other Sphinx assets --> |
| <link href="../../../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../../../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../../../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| |
| |
| <link href="../../../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../../../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" /> |
| |
| <link rel="stylesheet" type="text/css" href="../../../_static/pygments.css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/copybutton.css" /> |
| <link rel="stylesheet" type="text/css" href="../../../_static/css/pyspark.css" /> |
| |
| <!-- Pre-loaded scripts that we'll load fully later --> |
| <link rel="preload" as="script" href="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52" /> |
| <link rel="preload" as="script" href="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" /> |
| |
| <script data-url_root="../../../" id="documentation_options" src="../../../_static/documentation_options.js"></script> |
| <script src="../../../_static/jquery.js"></script> |
| <script src="../../../_static/underscore.js"></script> |
| <script src="../../../_static/doctools.js"></script> |
| <script src="../../../_static/clipboard.min.js"></script> |
| <script src="../../../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script>DOCUMENTATION_OPTIONS.pagename = '_modules/pyspark/sql/dataframe';</script> |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/_modules/pyspark/sql/dataframe.html" /> |
| <link rel="search" title="Search" href="../../../search.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="None"> |
| |
| |
| |
| </head> |
| |
| |
| <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode=""> |
| |
| |
| |
| <a class="skip-link" href="#main-content">Skip to main content</a> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__primary" |
| id="__primary"/> |
| <label class="overlay overlay-primary" for="__primary"></label> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__secondary" |
| id="__secondary"/> |
| <label class="overlay overlay-secondary" for="__secondary"></label> |
| |
| <div class="search-button__wrapper"> |
| <div class="search-button__overlay"></div> |
| <div class="search-button__search-container"> |
| <form class="bd-search d-flex align-items-center" |
| action="../../../search.html" |
| method="get"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| <input type="search" |
| class="form-control" |
| name="q" |
| id="search-input" |
| placeholder="Search the docs ..." |
| aria-label="Search the docs ..." |
| autocomplete="off" |
| autocorrect="off" |
| autocapitalize="off" |
| spellcheck="false"/> |
| <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span> |
| </form></div> |
| </div> |
| |
| <nav class="bd-header navbar navbar-expand-lg bd-navbar"> |
| <div class="bd-header__inner bd-page-width"> |
| <label class="sidebar-toggle primary-toggle" for="__primary"> |
| <span class="fa-solid fa-bars"></span> |
| </label> |
| |
| <div class="navbar-header-items__start"> |
| |
| <div class="navbar-item"> |
| |
| |
| <a class="navbar-brand logo" href="../../../index.html"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <img src="../../../_static/spark-logo-light.png" class="logo__image only-light" alt="Logo image"/> |
| <script>document.write(`<img src="../../../_static/spark-logo-dark.png" class="logo__image only-dark" alt="Logo image"/>`);</script> |
| |
| |
| </a></div> |
| |
| </div> |
| |
| |
| <div class="col-lg-9 navbar-header-items"> |
| |
| <div class="me-auto navbar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| <div class="navbar-header-items__end"> |
| |
| <div class="navbar-item navbar-persistent--container"> |
| |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview1 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Function to construct the target URL from the JSON components |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| template = template.replace("{version}", entry.version); |
| return template; |
| } |
| |
| // Function to check if corresponding page path exists in other version of docs |
| // and, if so, go there instead of the homepage of the other docs version |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "_modules/pyspark/sql/dataframe.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| let tryUrl = `${otherDocsHomepage}${currentFilePath}`; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Function to populate the version switcher |
| (function () { |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| node.onclick = checkPageExistsAndRedirect; |
| $("#version_switcher").append(node); |
| }); |
| }); |
| })(); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <label class="sr-only">GitHub</label></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <label class="sr-only">PyPI</label></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="navbar-persistent--mobile"> |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| |
| </div> |
| |
| </nav> |
| |
| <div class="bd-container"> |
| <div class="bd-container__inner bd-page-width"> |
| |
| <div class="bd-sidebar-primary bd-sidebar hide-on-wide"> |
| |
| |
| |
| <div class="sidebar-header-items sidebar-primary__section"> |
| |
| |
| <div class="sidebar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../../../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| |
| <div class="sidebar-header-items__end"> |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview1 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Function to construct the target URL from the JSON components |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| template = template.replace("{version}", entry.version); |
| return template; |
| } |
| |
| // Function to check if corresponding page path exists in other version of docs |
| // and, if so, go there instead of the homepage of the other docs version |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "_modules/pyspark/sql/dataframe.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| let tryUrl = `${otherDocsHomepage}${currentFilePath}`; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Function to populate the version switcher |
| (function () { |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| node.onclick = checkPageExistsAndRedirect; |
| $("#version_switcher").append(node); |
| }); |
| }); |
| })(); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <label class="sr-only">GitHub</label></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <label class="sr-only">PyPI</label></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="sidebar-primary-items__end sidebar-primary__section"> |
| </div> |
| |
| <div id="rtd-footer-container"></div> |
| |
| |
| </div> |
| |
| <main id="main-content" class="bd-main"> |
| |
| |
| <div class="bd-content"> |
| <div class="bd-article-container"> |
| |
| <div class="bd-header-article"> |
| <div class="header-article-items header-article__inner"> |
| |
| <div class="header-article-items__start"> |
| |
| <div class="header-article-item"> |
| |
| |
| |
| <nav aria-label="Breadcrumbs"> |
| <ul class="bd-breadcrumbs" role="navigation" aria-label="Breadcrumb"> |
| |
| <li class="breadcrumb-item breadcrumb-home"> |
| <a href="../../../index.html" class="nav-link" aria-label="Home"> |
| <i class="fa-solid fa-home"></i> |
| </a> |
| </li> |
| |
| <li class="breadcrumb-item"><a href="../../index.html" class="nav-link">Module code</a></li> |
| |
| <li class="breadcrumb-item active" aria-current="page">pyspark.sql.dataframe</li> |
| </ul> |
| </nav> |
| </div> |
| |
| </div> |
| |
| |
| </div> |
| </div> |
| |
| |
| |
| |
| <div id="searchbox"></div> |
| <article class="bd-article" role="main"> |
| |
| <h1>Source code for pyspark.sql.dataframe</h1><div class="highlight"><pre> |
| <span></span><span class="c1">#</span> |
| <span class="c1"># Licensed to the Apache Software Foundation (ASF) under one or more</span> |
| <span class="c1"># contributor license agreements. See the NOTICE file distributed with</span> |
| <span class="c1"># this work for additional information regarding copyright ownership.</span> |
| <span class="c1"># The ASF licenses this file to You under the Apache License, Version 2.0</span> |
| <span class="c1"># (the "License"); you may not use this file except in compliance with</span> |
| <span class="c1"># the License. You may obtain a copy of the License at</span> |
| <span class="c1">#</span> |
| <span class="c1"># http://www.apache.org/licenses/LICENSE-2.0</span> |
| <span class="c1">#</span> |
| <span class="c1"># Unless required by applicable law or agreed to in writing, software</span> |
| <span class="c1"># distributed under the License is distributed on an "AS IS" BASIS,</span> |
| <span class="c1"># WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.</span> |
| <span class="c1"># See the License for the specific language governing permissions and</span> |
| <span class="c1"># limitations under the License.</span> |
| <span class="c1">#</span> |
| |
| <span class="c1"># mypy: disable-error-code="empty-body"</span> |
| |
| <span class="kn">from</span> <span class="nn">typing</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">Any</span><span class="p">,</span> |
| <span class="n">Callable</span><span class="p">,</span> |
| <span class="n">Dict</span><span class="p">,</span> |
| <span class="n">Iterator</span><span class="p">,</span> |
| <span class="n">List</span><span class="p">,</span> |
| <span class="n">Optional</span><span class="p">,</span> |
| <span class="n">Sequence</span><span class="p">,</span> |
| <span class="n">Tuple</span><span class="p">,</span> |
| <span class="n">Union</span><span class="p">,</span> |
| <span class="n">overload</span><span class="p">,</span> |
| <span class="n">TYPE_CHECKING</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark</span> <span class="kn">import</span> <span class="n">_NoValue</span> |
| <span class="kn">from</span> <span class="nn">pyspark._globals</span> <span class="kn">import</span> <span class="n">_NoValueType</span> |
| <span class="kn">from</span> <span class="nn">pyspark.util</span> <span class="kn">import</span> <span class="n">is_remote_only</span> |
| <span class="kn">from</span> <span class="nn">pyspark.storagelevel</span> <span class="kn">import</span> <span class="n">StorageLevel</span> |
| <span class="kn">from</span> <span class="nn">pyspark.resource</span> <span class="kn">import</span> <span class="n">ResourceProfile</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.column</span> <span class="kn">import</span> <span class="n">Column</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.readwriter</span> <span class="kn">import</span> <span class="n">DataFrameWriter</span><span class="p">,</span> <span class="n">DataFrameWriterV2</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.streaming</span> <span class="kn">import</span> <span class="n">DataStreamWriter</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.types</span> <span class="kn">import</span> <span class="n">StructType</span><span class="p">,</span> <span class="n">Row</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.utils</span> <span class="kn">import</span> <span class="n">dispatch_df_method</span> |
| |
| <span class="k">if</span> <span class="n">TYPE_CHECKING</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">py4j.java_gateway</span> <span class="kn">import</span> <span class="n">JavaObject</span> |
| <span class="kn">import</span> <span class="nn">pyarrow</span> <span class="k">as</span> <span class="nn">pa</span> |
| <span class="kn">from</span> <span class="nn">pyspark.core.context</span> <span class="kn">import</span> <span class="n">SparkContext</span> |
| <span class="kn">from</span> <span class="nn">pyspark.core.rdd</span> <span class="kn">import</span> <span class="n">RDD</span> |
| <span class="kn">from</span> <span class="nn">pyspark._typing</span> <span class="kn">import</span> <span class="n">PrimitiveType</span> |
| <span class="kn">from</span> <span class="nn">pyspark.pandas.frame</span> <span class="kn">import</span> <span class="n">DataFrame</span> <span class="k">as</span> <span class="n">PandasOnSparkDataFrame</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql._typing</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">ColumnOrName</span><span class="p">,</span> |
| <span class="n">ColumnOrNameOrOrdinal</span><span class="p">,</span> |
| <span class="n">LiteralType</span><span class="p">,</span> |
| <span class="n">OptionalPrimitiveType</span><span class="p">,</span> |
| <span class="p">)</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.context</span> <span class="kn">import</span> <span class="n">SQLContext</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.session</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.group</span> <span class="kn">import</span> <span class="n">GroupedData</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.observation</span> <span class="kn">import</span> <span class="n">Observation</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.pandas._typing</span> <span class="kn">import</span> <span class="p">(</span> |
| <span class="n">PandasMapIterFunction</span><span class="p">,</span> |
| <span class="n">ArrowMapIterFunction</span><span class="p">,</span> |
| <span class="n">DataFrameLike</span> <span class="k">as</span> <span class="n">PandasDataFrameLike</span><span class="p">,</span> |
| <span class="p">)</span> |
| |
| |
| <span class="n">__all__</span> <span class="o">=</span> <span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">,</span> <span class="s2">"DataFrameNaFunctions"</span><span class="p">,</span> <span class="s2">"DataFrameStatFunctions"</span><span class="p">]</span> |
| |
| |
| <div class="viewcode-block" id="DataFrame"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.html#pyspark.sql.DataFrame">[docs]</a><span class="k">class</span> <span class="nc">DataFrame</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""A distributed collection of data grouped into named columns.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> A :class:`DataFrame` is equivalent to a relational table in Spark SQL,</span> |
| <span class="sd"> and can be created using various functions in :class:`SparkSession`:</span> |
| |
| <span class="sd"> >>> people = spark.createDataFrame([</span> |
| <span class="sd"> ... {"deptId": 1, "age": 40, "name": "Hyukjin Kwon", "gender": "M", "salary": 50},</span> |
| <span class="sd"> ... {"deptId": 1, "age": 50, "name": "Takuya Ueshin", "gender": "M", "salary": 100},</span> |
| <span class="sd"> ... {"deptId": 2, "age": 60, "name": "Xinrong Meng", "gender": "F", "salary": 150},</span> |
| <span class="sd"> ... {"deptId": 3, "age": 20, "name": "Haejoon Lee", "gender": "M", "salary": 200}</span> |
| <span class="sd"> ... ])</span> |
| |
| <span class="sd"> Once created, it can be manipulated using the various domain-specific-language</span> |
| <span class="sd"> (DSL) functions defined in: :class:`DataFrame`, :class:`Column`.</span> |
| |
| <span class="sd"> To select a column from the :class:`DataFrame`, use the apply method:</span> |
| |
| <span class="sd"> >>> age_col = people.age</span> |
| |
| <span class="sd"> A more concrete example:</span> |
| |
| <span class="sd"> >>> # To create DataFrame using SparkSession</span> |
| <span class="sd"> ... department = spark.createDataFrame([</span> |
| <span class="sd"> ... {"id": 1, "name": "PySpark"},</span> |
| <span class="sd"> ... {"id": 2, "name": "ML"},</span> |
| <span class="sd"> ... {"id": 3, "name": "Spark SQL"}</span> |
| <span class="sd"> ... ])</span> |
| |
| <span class="sd"> >>> people.filter(people.age > 30).join(</span> |
| <span class="sd"> ... department, people.deptId == department.id).groupBy(</span> |
| <span class="sd"> ... department.name, "gender").agg(</span> |
| <span class="sd"> ... {"salary": "avg", "age": "max"}).sort("max(age)").show()</span> |
| <span class="sd"> +-------+------+-----------+--------+</span> |
| <span class="sd"> | name|gender|avg(salary)|max(age)|</span> |
| <span class="sd"> +-------+------+-----------+--------+</span> |
| <span class="sd"> |PySpark| M| 75.0| 50|</span> |
| <span class="sd"> | ML| F| 150.0| 60|</span> |
| <span class="sd"> +-------+------+-----------+--------+</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> A DataFrame should only be created as described above. It should not be directly</span> |
| <span class="sd"> created via using the constructor.</span> |
| <span class="sd"> """</span> |
| |
| <span class="c1"># HACK ALERT!! this is to reduce the backward compatibility concern, and returns</span> |
| <span class="c1"># Spark Classic DataFrame by default. This is NOT an API, and NOT supposed to</span> |
| <span class="c1"># be directly invoked. DO NOT use this constructor.</span> |
| <span class="n">_sql_ctx</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"SQLContext"</span><span class="p">]</span> |
| <span class="n">_session</span><span class="p">:</span> <span class="s2">"SparkSession"</span> |
| <span class="n">_sc</span><span class="p">:</span> <span class="s2">"SparkContext"</span> |
| <span class="n">_jdf</span><span class="p">:</span> <span class="s2">"JavaObject"</span> |
| <span class="n">is_cached</span><span class="p">:</span> <span class="nb">bool</span> |
| <span class="n">_schema</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">StructType</span><span class="p">]</span> |
| <span class="n">_lazy_rdd</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="s2">"RDD[Row]"</span><span class="p">]</span> |
| <span class="n">_support_repr_html</span><span class="p">:</span> <span class="nb">bool</span> |
| |
| <span class="k">def</span> <span class="fm">__new__</span><span class="p">(</span> |
| <span class="bp">cls</span><span class="p">,</span> |
| <span class="n">jdf</span><span class="p">:</span> <span class="s2">"JavaObject"</span><span class="p">,</span> |
| <span class="n">sql_ctx</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"SQLContext"</span><span class="p">,</span> <span class="s2">"SparkSession"</span><span class="p">],</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.classic.dataframe</span> <span class="kn">import</span> <span class="n">DataFrame</span> |
| |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="o">.</span><span class="fm">__new__</span><span class="p">(</span><span class="n">DataFrame</span><span class="p">,</span> <span class="n">jdf</span><span class="p">,</span> <span class="n">sql_ctx</span><span class="p">)</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">sparkSession</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"SparkSession"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns Spark session that created this :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`SparkSession`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> type(df.sparkSession)</span> |
| <span class="sd"> <class '...session.SparkSession'></span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_remote_only</span><span class="p">():</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">rdd</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[Row]"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the content as an :class:`pyspark.RDD` of :class:`Row`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`RDD`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> type(df.rdd)</span> |
| <span class="sd"> <class 'pyspark.core.rdd.RDD'></span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">na</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrameNaFunctions"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a :class:`DataFrameNaFunctions` for handling missing values.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.1</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrameNaFunctions`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.sql("SELECT 1 AS c1, int(NULL) AS c2")</span> |
| <span class="sd"> >>> type(df.na)</span> |
| <span class="sd"> <class '...dataframe.DataFrameNaFunctions'></span> |
| |
| <span class="sd"> Replace the missing values as 2.</span> |
| |
| <span class="sd"> >>> df.na.fill(2).show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | c1| c2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">stat</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrameStatFunctions"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a :class:`DataFrameStatFunctions` for statistic functions.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrameStatFunctions`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyspark.sql.functions as f</span> |
| <span class="sd"> >>> df = spark.range(3).withColumn("c", f.expr("id + 1"))</span> |
| <span class="sd"> >>> type(df.stat)</span> |
| <span class="sd"> <class '...dataframe.DataFrameStatFunctions'></span> |
| <span class="sd"> >>> df.stat.corr("id", "c")</span> |
| <span class="sd"> 1.0</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <span class="k">if</span> <span class="ow">not</span> <span class="n">is_remote_only</span><span class="p">():</span> |
| |
| <div class="viewcode-block" id="DataFrame.toJSON"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.toJSON.html#pyspark.sql.DataFrame.toJSON">[docs]</a> <span class="k">def</span> <span class="nf">toJSON</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">use_unicode</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"RDD[str]"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Converts a :class:`DataFrame` into a :class:`RDD` of string.</span> |
| |
| <span class="sd"> Each row is turned into a JSON document as one element in the returned RDD.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> use_unicode : bool, optional, default True</span> |
| <span class="sd"> Whether to convert to unicode or not.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`RDD`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df.toJSON().first()</span> |
| <span class="sd"> '{"age":2,"name":"Alice"}'</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.registerTempTable"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.registerTempTable.html#pyspark.sql.DataFrame.registerTempTable">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">registerTempTable</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Registers this :class:`DataFrame` as a temporary table using the given name.</span> |
| |
| <span class="sd"> The lifetime of this temporary table is tied to the :class:`SparkSession`</span> |
| <span class="sd"> that was used to create this :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> .. deprecated:: 2.0.0</span> |
| <span class="sd"> Use :meth:`DataFrame.createOrReplaceTempView` instead.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> name : str</span> |
| <span class="sd"> Name of the temporary table to register.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df.registerTempTable("people")</span> |
| <span class="sd"> >>> df2 = spark.sql("SELECT * FROM people")</span> |
| <span class="sd"> >>> sorted(df.collect()) == sorted(df2.collect())</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> spark.catalog.dropTempView("people")</span> |
| <span class="sd"> True</span> |
| |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.createTempView"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.createTempView.html#pyspark.sql.DataFrame.createTempView">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">createTempView</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Creates a local temporary view with this :class:`DataFrame`.</span> |
| |
| <span class="sd"> The lifetime of this temporary table is tied to the :class:`SparkSession`</span> |
| <span class="sd"> that was used to create this :class:`DataFrame`.</span> |
| <span class="sd"> throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the</span> |
| <span class="sd"> catalog.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> name : str</span> |
| <span class="sd"> Name of the view.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Creating and querying a local temporary view</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df.createTempView("people")</span> |
| <span class="sd"> >>> spark.sql("SELECT * FROM people").show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> Example 2: Attempting to create a temporary view with an existing name</span> |
| |
| <span class="sd"> >>> df.createTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> AnalysisException: "Temporary table 'people' already exists;"</span> |
| |
| <span class="sd"> Example 3: Creating and dropping a local temporary view</span> |
| |
| <span class="sd"> >>> spark.catalog.dropTempView("people")</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> df.createTempView("people")</span> |
| |
| <span class="sd"> Example 4: Creating temporary views with multiple DataFrames with</span> |
| <span class="sd"> :meth:`SparkSession.table`</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([(1, "John"), (2, "Jane")], schema=["id", "name"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(3, "Jake"), (4, "Jill")], schema=["id", "name"])</span> |
| <span class="sd"> >>> df1.createTempView("table1")</span> |
| <span class="sd"> >>> df2.createTempView("table2")</span> |
| <span class="sd"> >>> result_df = spark.table("table1").union(spark.table("table2"))</span> |
| <span class="sd"> >>> result_df.show()</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> | id|name|</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> | 1|John|</span> |
| <span class="sd"> | 2|Jane|</span> |
| <span class="sd"> | 3|Jake|</span> |
| <span class="sd"> | 4|Jill|</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.createOrReplaceTempView"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.createOrReplaceTempView.html#pyspark.sql.DataFrame.createOrReplaceTempView">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">createOrReplaceTempView</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Creates or replaces a local temporary view with this :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> name : str</span> |
| <span class="sd"> Name of the view.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The lifetime of this temporary table is tied to the :class:`SparkSession`</span> |
| <span class="sd"> that was used to create this :class:`DataFrame`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Creating a local temporary view named 'people'.</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df.createOrReplaceTempView("people")</span> |
| |
| <span class="sd"> Example 2: Replacing the local temporary view.</span> |
| |
| <span class="sd"> >>> df2 = df.filter(df.age > 3)</span> |
| <span class="sd"> >>> # Replace the local temporary view with the filtered DataFrame</span> |
| <span class="sd"> >>> df2.createOrReplaceTempView("people")</span> |
| <span class="sd"> >>> # Query the temporary view</span> |
| <span class="sd"> >>> df3 = spark.sql("SELECT * FROM people")</span> |
| <span class="sd"> >>> # Check if the DataFrames are equal</span> |
| <span class="sd"> ... assert sorted(df3.collect()) == sorted(df2.collect())</span> |
| |
| <span class="sd"> Example 3: Dropping the temporary view.</span> |
| |
| <span class="sd"> >>> # Drop the local temporary view</span> |
| <span class="sd"> ... spark.catalog.dropTempView("people")</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.createGlobalTempView"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.createGlobalTempView.html#pyspark.sql.DataFrame.createGlobalTempView">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">createGlobalTempView</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Creates a global temporary view with this :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> name : str</span> |
| <span class="sd"> Name of the view.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The lifetime of this temporary view is tied to this Spark application.</span> |
| <span class="sd"> throws :class:`TempTableAlreadyExistsException`, if the view name already exists in the</span> |
| <span class="sd"> catalog.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Creating and querying a global temporary view</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df.createGlobalTempView("people")</span> |
| <span class="sd"> >>> df2 = spark.sql("SELECT * FROM global_temp.people")</span> |
| <span class="sd"> >>> df2.show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> Example 2: Attempting to create a duplicate global temporary view</span> |
| |
| <span class="sd"> >>> df.createGlobalTempView("people") # doctest: +IGNORE_EXCEPTION_DETAIL</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> AnalysisException: "Temporary table 'people' already exists;"</span> |
| |
| <span class="sd"> Example 3: Dropping a global temporary view</span> |
| |
| <span class="sd"> >>> spark.catalog.dropGlobalTempView("people")</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.createOrReplaceGlobalTempView"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.createOrReplaceGlobalTempView.html#pyspark.sql.DataFrame.createOrReplaceGlobalTempView">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">createOrReplaceGlobalTempView</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Creates or replaces a global temporary view using the given name.</span> |
| |
| <span class="sd"> The lifetime of this temporary view is tied to this Spark application.</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> name : str</span> |
| <span class="sd"> Name of the view.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Creating a global temporary view with a DataFrame</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df.createOrReplaceGlobalTempView("people")</span> |
| |
| <span class="sd"> Example 2: Replacing a global temporary view with a filtered DataFrame</span> |
| |
| <span class="sd"> >>> df2 = df.filter(df.age > 3)</span> |
| <span class="sd"> >>> df2.createOrReplaceGlobalTempView("people")</span> |
| <span class="sd"> >>> df3 = spark.table("global_temp.people")</span> |
| <span class="sd"> >>> sorted(df3.collect()) == sorted(df2.collect())</span> |
| <span class="sd"> True</span> |
| |
| <span class="sd"> Example 3: Dropping a global temporary view</span> |
| <span class="sd"> >>> spark.catalog.dropGlobalTempView("people")</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">write</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrameWriter</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Interface for saving the content of the non-streaming :class:`DataFrame` out into external</span> |
| <span class="sd"> storage.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrameWriter`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> type(df.write)</span> |
| <span class="sd"> <class '...readwriter.DataFrameWriter'></span> |
| |
| <span class="sd"> Write the DataFrame as a table.</span> |
| |
| <span class="sd"> >>> _ = spark.sql("DROP TABLE IF EXISTS tab2")</span> |
| <span class="sd"> >>> df.write.saveAsTable("tab2")</span> |
| <span class="sd"> >>> _ = spark.sql("DROP TABLE tab2")</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">writeStream</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataStreamWriter</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Interface for saving the content of the streaming :class:`DataFrame` out into external</span> |
| <span class="sd"> storage.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.5.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is evolving.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataStreamWriter`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import time</span> |
| <span class="sd"> >>> import tempfile</span> |
| <span class="sd"> >>> df = spark.readStream.format("rate").load()</span> |
| <span class="sd"> >>> type(df.writeStream)</span> |
| <span class="sd"> <class '...streaming.readwriter.DataStreamWriter'></span> |
| |
| <span class="sd"> >>> with tempfile.TemporaryDirectory(prefix="writeStream") as d:</span> |
| <span class="sd"> ... # Create a table with Rate source.</span> |
| <span class="sd"> ... query = df.writeStream.toTable(</span> |
| <span class="sd"> ... "my_table", checkpointLocation=d)</span> |
| <span class="sd"> ... time.sleep(3)</span> |
| <span class="sd"> ... query.stop()</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">schema</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">StructType</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the schema of this :class:`DataFrame` as a :class:`pyspark.sql.types.StructType`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`StructType`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Retrieve the inferred schema of the current DataFrame.</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.schema</span> |
| <span class="sd"> StructType([StructField('age', LongType(), True),</span> |
| <span class="sd"> StructField('name', StringType(), True)])</span> |
| |
| <span class="sd"> Example 2: Retrieve the schema of the current DataFrame (DDL-formatted schema).</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")],</span> |
| <span class="sd"> ... "age INT, name STRING")</span> |
| <span class="sd"> >>> df.schema</span> |
| <span class="sd"> StructType([StructField('age', IntegerType(), True),</span> |
| <span class="sd"> StructField('name', StringType(), True)])</span> |
| |
| <span class="sd"> Example 3: Retrieve the specified schema of the current DataFrame.</span> |
| |
| <span class="sd"> >>> from pyspark.sql.types import StructType, StructField, StringType</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [("a",), ("b",), ("c",)],</span> |
| <span class="sd"> ... StructType([StructField("value", StringType(), False)]))</span> |
| <span class="sd"> >>> df.schema</span> |
| <span class="sd"> StructType([StructField('value', StringType(), False)])</span> |
| |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.printSchema"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.printSchema.html#pyspark.sql.DataFrame.printSchema">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">printSchema</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">level</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Prints out the schema in the tree format.</span> |
| <span class="sd"> Optionally allows to specify how many levels to print if schema is nested.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> level : int, optional</span> |
| <span class="sd"> How many levels to print for nested schemas.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Printing the schema of a DataFrame with basic columns</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- age: long (nullable = true)</span> |
| <span class="sd"> |-- name: string (nullable = true)</span> |
| |
| <span class="sd"> Example 2: Printing the schema with a specified level for nested columns</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(1, (2, 2))], ["a", "b"])</span> |
| <span class="sd"> >>> df.printSchema(1)</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- a: long (nullable = true)</span> |
| <span class="sd"> |-- b: struct (nullable = true)</span> |
| |
| <span class="sd"> Example 3: Printing the schema with deeper nesting level</span> |
| |
| <span class="sd"> >>> df.printSchema(2)</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- a: long (nullable = true)</span> |
| <span class="sd"> |-- b: struct (nullable = true)</span> |
| <span class="sd"> | |-- _1: long (nullable = true)</span> |
| <span class="sd"> | |-- _2: long (nullable = true)</span> |
| |
| <span class="sd"> Example 4: Printing the schema of a DataFrame with nullable and non-nullable columns</span> |
| |
| <span class="sd"> >>> df = spark.range(1).selectExpr("id AS nonnullable", "NULL AS nullable")</span> |
| <span class="sd"> >>> df.printSchema()</span> |
| <span class="sd"> root</span> |
| <span class="sd"> |-- nonnullable: long (nullable = false)</span> |
| <span class="sd"> |-- nullable: void (nullable = true)</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.explain"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.explain.html#pyspark.sql.DataFrame.explain">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">explain</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">extended</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="n">mode</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Prints the (logical and physical) plans to the console for debugging purposes.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> extended : bool, optional</span> |
| <span class="sd"> default ``False``. If ``False``, prints only the physical plan.</span> |
| <span class="sd"> When this is a string without specifying the ``mode``, it works as the mode is</span> |
| <span class="sd"> specified.</span> |
| <span class="sd"> mode : str, optional</span> |
| <span class="sd"> specifies the expected output format of plans.</span> |
| |
| <span class="sd"> * ``simple``: Print only a physical plan.</span> |
| <span class="sd"> * ``extended``: Print both logical and physical plans.</span> |
| <span class="sd"> * ``codegen``: Print a physical plan and generated codes if they are available.</span> |
| <span class="sd"> * ``cost``: Print a logical plan and statistics if they are available.</span> |
| <span class="sd"> * ``formatted``: Split explain output into two sections: a physical plan outline \</span> |
| <span class="sd"> and node details.</span> |
| |
| <span class="sd"> .. versionchanged:: 3.0.0</span> |
| <span class="sd"> Added optional argument `mode` to specify the expected output format of plans.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Print out the physical plan only (default).</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.explain() # doctest: +SKIP</span> |
| <span class="sd"> == Physical Plan ==</span> |
| <span class="sd"> *(1) Scan ExistingRDD[age...,name...]</span> |
| |
| <span class="sd"> Example 2: Print out all parsed, analyzed, optimized, and physical plans.</span> |
| |
| <span class="sd"> >>> df.explain(extended=True)</span> |
| <span class="sd"> == Parsed Logical Plan ==</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> == Analyzed Logical Plan ==</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> == Optimized Logical Plan ==</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> == Physical Plan ==</span> |
| <span class="sd"> ...</span> |
| |
| <span class="sd"> Example 3: Print out the plans with two sections: a physical plan outline and node details.</span> |
| |
| <span class="sd"> >>> df.explain(mode="formatted") # doctest: +SKIP</span> |
| <span class="sd"> == Physical Plan ==</span> |
| <span class="sd"> * Scan ExistingRDD (...)</span> |
| <span class="sd"> (1) Scan ExistingRDD [codegen id : ...]</span> |
| <span class="sd"> Output [2]: [age..., name...]</span> |
| <span class="sd"> ...</span> |
| |
| <span class="sd"> Example 4: Print a logical plan and statistics if they are available.</span> |
| |
| <span class="sd"> >>> df.explain(mode="cost")</span> |
| <span class="sd"> == Optimized Logical Plan ==</span> |
| <span class="sd"> ...Statistics...</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.exceptAll"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.exceptAll.html#pyspark.sql.DataFrame.exceptAll">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">exceptAll</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Return a new :class:`DataFrame` containing rows in this :class:`DataFrame` but</span> |
| <span class="sd"> not in another :class:`DataFrame` while preserving duplicates.</span> |
| |
| <span class="sd"> This is equivalent to `EXCEPT ALL` in SQL.</span> |
| <span class="sd"> As standard in SQL, this function resolves columns by position (not by name).</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : :class:`DataFrame`</span> |
| <span class="sd"> The other :class:`DataFrame` to compare to.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df1 = spark.createDataFrame(</span> |
| <span class="sd"> ... [("a", 1), ("a", 1), ("a", 1), ("a", 2), ("b", 3), ("c", 4)], ["C1", "C2"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([("a", 1), ("b", 3)], ["C1", "C2"])</span> |
| <span class="sd"> >>> df1.exceptAll(df2).show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | C1| C2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | a| 2|</span> |
| <span class="sd"> | c| 4|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.isLocal"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.isLocal.html#pyspark.sql.DataFrame.isLocal">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">isLocal</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns ``True`` if the :func:`collect` and :func:`take` methods can be run locally</span> |
| <span class="sd"> (without any Spark executors).</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> bool</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.sql("SHOW TABLES")</span> |
| <span class="sd"> >>> df.isLocal()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">isStreaming</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns ``True`` if this :class:`DataFrame` contains one or more sources that</span> |
| <span class="sd"> continuously return data as it arrives. A :class:`DataFrame` that reads data from a</span> |
| <span class="sd"> streaming source must be executed as a :class:`StreamingQuery` using the :func:`start`</span> |
| <span class="sd"> method in :class:`DataStreamWriter`. Methods that return a single answer, (e.g.,</span> |
| <span class="sd"> :func:`count` or :func:`collect`) will throw an :class:`AnalysisException` when there</span> |
| <span class="sd"> is a streaming source present.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is evolving.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> bool</span> |
| <span class="sd"> Whether it's streaming DataFrame or not.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.readStream.format("rate").load()</span> |
| <span class="sd"> >>> df.isStreaming</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.isEmpty"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.isEmpty.html#pyspark.sql.DataFrame.isEmpty">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">isEmpty</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Checks if the :class:`DataFrame` is empty and returns a boolean value.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> bool</span> |
| <span class="sd"> Returns ``True`` if the DataFrame is empty, ``False`` otherwise.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.count : Counts the number of rows in DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> - Unlike `count()`, this method does not trigger any computation.</span> |
| <span class="sd"> - An empty DataFrame has no rows. It may have columns, but no data.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Checking if an empty DataFrame is empty</span> |
| |
| <span class="sd"> >>> df_empty = spark.createDataFrame([], 'a STRING')</span> |
| <span class="sd"> >>> df_empty.isEmpty()</span> |
| <span class="sd"> True</span> |
| |
| <span class="sd"> Example 2: Checking if a non-empty DataFrame is empty</span> |
| |
| <span class="sd"> >>> df_non_empty = spark.createDataFrame(["a"], 'STRING')</span> |
| <span class="sd"> >>> df_non_empty.isEmpty()</span> |
| <span class="sd"> False</span> |
| |
| <span class="sd"> Example 3: Checking if a DataFrame with null values is empty</span> |
| |
| <span class="sd"> >>> df_nulls = spark.createDataFrame([(None, None)], 'a STRING, b INT')</span> |
| <span class="sd"> >>> df_nulls.isEmpty()</span> |
| <span class="sd"> False</span> |
| |
| <span class="sd"> Example 4: Checking if a DataFrame with no rows but with columns is empty</span> |
| |
| <span class="sd"> >>> df_no_rows = spark.createDataFrame([], 'id INT, value STRING')</span> |
| <span class="sd"> >>> df_no_rows.isEmpty()</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.show"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.show.html#pyspark.sql.DataFrame.show">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">show</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span> <span class="o">=</span> <span class="mi">20</span><span class="p">,</span> <span class="n">truncate</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">bool</span><span class="p">,</span> <span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> <span class="n">vertical</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Prints the first ``n`` rows of the DataFrame to the console.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int, optional, default 20</span> |
| <span class="sd"> Number of rows to show.</span> |
| <span class="sd"> truncate : bool or int, optional, default True</span> |
| <span class="sd"> If set to ``True``, truncate strings longer than 20 chars.</span> |
| <span class="sd"> If set to a number greater than one, truncates long strings to length ``truncate``</span> |
| <span class="sd"> and align cells right.</span> |
| <span class="sd"> vertical : bool, optional</span> |
| <span class="sd"> If set to ``True``, print output rows vertically (one line per column value).</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (14, "Tom"), (23, "Alice"), (16, "Bob"), (19, "This is a super long name")],</span> |
| <span class="sd"> ... ["age", "name"])</span> |
| |
| <span class="sd"> Show :class:`DataFrame`</span> |
| |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +---+--------------------+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+--------------------+</span> |
| <span class="sd"> | 14| Tom|</span> |
| <span class="sd"> | 23| Alice|</span> |
| <span class="sd"> | 16| Bob|</span> |
| <span class="sd"> | 19|This is a super l...|</span> |
| <span class="sd"> +---+--------------------+</span> |
| |
| <span class="sd"> Show only top 2 rows.</span> |
| |
| <span class="sd"> >>> df.show(2)</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 14| Tom|</span> |
| <span class="sd"> | 23|Alice|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> only showing top 2 rows</span> |
| |
| <span class="sd"> Show full column content without truncation.</span> |
| |
| <span class="sd"> >>> df.show(truncate=False)</span> |
| <span class="sd"> +---+-------------------------+</span> |
| <span class="sd"> |age|name |</span> |
| <span class="sd"> +---+-------------------------+</span> |
| <span class="sd"> |14 |Tom |</span> |
| <span class="sd"> |23 |Alice |</span> |
| <span class="sd"> |16 |Bob |</span> |
| <span class="sd"> |19 |This is a super long name|</span> |
| <span class="sd"> +---+-------------------------+</span> |
| |
| <span class="sd"> Show :class:`DataFrame` where the maximum number of characters is 3.</span> |
| |
| <span class="sd"> >>> df.show(truncate=3)</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> |age|name|</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> | 14| Tom|</span> |
| <span class="sd"> | 23| Ali|</span> |
| <span class="sd"> | 16| Bob|</span> |
| <span class="sd"> | 19| Thi|</span> |
| <span class="sd"> +---+----+</span> |
| |
| <span class="sd"> Show :class:`DataFrame` vertically.</span> |
| |
| <span class="sd"> >>> df.show(vertical=True)</span> |
| <span class="sd"> -RECORD 0--------------------</span> |
| <span class="sd"> age | 14</span> |
| <span class="sd"> name | Tom</span> |
| <span class="sd"> -RECORD 1--------------------</span> |
| <span class="sd"> age | 23</span> |
| <span class="sd"> name | Alice</span> |
| <span class="sd"> -RECORD 2--------------------</span> |
| <span class="sd"> age | 16</span> |
| <span class="sd"> name | Bob</span> |
| <span class="sd"> -RECORD 3--------------------</span> |
| <span class="sd"> age | 19</span> |
| <span class="sd"> name | This is a super l...</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="fm">__repr__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">str</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">_repr_html_</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""Returns a :class:`DataFrame` with html code when you enabled eager evaluation</span> |
| <span class="sd"> by 'spark.sql.repl.eagerEval.enabled', this only called by REPL you are</span> |
| <span class="sd"> using support eager evaluation with HTML.</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.checkpoint"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.checkpoint.html#pyspark.sql.DataFrame.checkpoint">[docs]</a> <span class="k">def</span> <span class="nf">checkpoint</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">eager</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a checkpointed version of this :class:`DataFrame`. Checkpointing can be</span> |
| <span class="sd"> used to truncate the logical plan of this :class:`DataFrame`, which is especially</span> |
| <span class="sd"> useful in iterative algorithms where the plan may grow exponentially. It will be</span> |
| <span class="sd"> saved to files inside the checkpoint directory set with</span> |
| <span class="sd"> :meth:`SparkContext.setCheckpointDir`, or `spark.checkpoint.dir` configuration.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> eager : bool, optional, default True</span> |
| <span class="sd"> Whether to checkpoint this :class:`DataFrame` immediately.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Checkpointed DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is experimental.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.checkpoint(False) # doctest: +SKIP</span> |
| <span class="sd"> DataFrame[age: bigint, name: string]</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.localCheckpoint"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.localCheckpoint.html#pyspark.sql.DataFrame.localCheckpoint">[docs]</a> <span class="k">def</span> <span class="nf">localCheckpoint</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">eager</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a locally checkpointed version of this :class:`DataFrame`. Checkpointing can</span> |
| <span class="sd"> be used to truncate the logical plan of this :class:`DataFrame`, which is especially</span> |
| <span class="sd"> useful in iterative algorithms where the plan may grow exponentially. Local checkpoints</span> |
| <span class="sd"> are stored in the executors using the caching subsystem and therefore they are not</span> |
| <span class="sd"> reliable.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> eager : bool, optional, default True</span> |
| <span class="sd"> Whether to checkpoint this :class:`DataFrame` immediately.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Checkpointed DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is experimental.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.localCheckpoint(False)</span> |
| <span class="sd"> DataFrame[age: bigint, name: string]</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.withWatermark"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.withWatermark.html#pyspark.sql.DataFrame.withWatermark">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">withWatermark</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">eventTime</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">delayThreshold</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Defines an event time watermark for this :class:`DataFrame`. A watermark tracks a point</span> |
| <span class="sd"> in time before which we assume no more late data is going to arrive.</span> |
| |
| <span class="sd"> Spark will use this watermark for several purposes:</span> |
| <span class="sd"> - To know when a given time window aggregation can be finalized and thus can be emitted</span> |
| <span class="sd"> when using output modes that do not allow updates.</span> |
| |
| <span class="sd"> - To minimize the amount of state that we need to keep for on-going aggregations.</span> |
| |
| <span class="sd"> The current watermark is computed by looking at the `MAX(eventTime)` seen across</span> |
| <span class="sd"> all of the partitions in the query minus a user specified `delayThreshold`. Due to the cost</span> |
| <span class="sd"> of coordinating this value across partitions, the actual watermark used is only guaranteed</span> |
| <span class="sd"> to be at least `delayThreshold` behind the actual event time. In some cases we may still</span> |
| <span class="sd"> process records that arrive more than `delayThreshold` late.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.5.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> eventTime : str</span> |
| <span class="sd"> the name of the column that contains the event time of the row.</span> |
| <span class="sd"> delayThreshold : str</span> |
| <span class="sd"> the minimum delay to wait to data to arrive late, relative to the</span> |
| <span class="sd"> latest record that has been processed in the form of an interval</span> |
| <span class="sd"> (e.g. "1 minute" or "5 hours").</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Watermarked DataFrame</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This is a feature only for Structured Streaming.</span> |
| |
| <span class="sd"> This API is evolving.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> from pyspark.sql.functions import timestamp_seconds</span> |
| <span class="sd"> >>> df = spark.readStream.format("rate").load().selectExpr(</span> |
| <span class="sd"> ... "value % 5 AS value", "timestamp")</span> |
| <span class="sd"> >>> df.select("value", df.timestamp.alias("time")).withWatermark("time", '10 minutes')</span> |
| <span class="sd"> DataFrame[value: bigint, time: timestamp]</span> |
| |
| <span class="sd"> Group the data by window and value (0 - 4), and compute the count of each group.</span> |
| |
| <span class="sd"> >>> import time</span> |
| <span class="sd"> >>> from pyspark.sql.functions import window</span> |
| <span class="sd"> >>> query = (df</span> |
| <span class="sd"> ... .withWatermark("timestamp", "10 minutes")</span> |
| <span class="sd"> ... .groupBy(</span> |
| <span class="sd"> ... window(df.timestamp, "10 minutes", "5 minutes"),</span> |
| <span class="sd"> ... df.value)</span> |
| <span class="sd"> ... ).count().writeStream.outputMode("complete").format("console").start()</span> |
| <span class="sd"> >>> time.sleep(3)</span> |
| <span class="sd"> >>> query.stop()</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.hint"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.hint.html#pyspark.sql.DataFrame.hint">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">hint</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="o">*</span><span class="n">parameters</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"PrimitiveType"</span><span class="p">,</span> <span class="s2">"Column"</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">"PrimitiveType"</span><span class="p">]]</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Specifies some hint on the current :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.2.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> name : str</span> |
| <span class="sd"> A name of the hint.</span> |
| <span class="sd"> parameters : str, list, float or int</span> |
| <span class="sd"> Optional parameters.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Hinted DataFrame</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([Row(height=80, name="Tom"), Row(height=85, name="Bob")])</span> |
| <span class="sd"> >>> df.join(df2, "name").explain() # doctest: +SKIP</span> |
| <span class="sd"> == Physical Plan ==</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ... +- SortMergeJoin ...</span> |
| <span class="sd"> ...</span> |
| |
| <span class="sd"> Explicitly trigger the broadcast hashjoin by providing the hint in ``df2``.</span> |
| |
| <span class="sd"> >>> df.join(df2.hint("broadcast"), "name").explain()</span> |
| <span class="sd"> == Physical Plan ==</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ... +- BroadcastHashJoin ...</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.count"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.count.html#pyspark.sql.DataFrame.count">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">count</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the number of rows in this :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> int</span> |
| <span class="sd"> Number of rows.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| |
| <span class="sd"> Return the number of rows in the :class:`DataFrame`.</span> |
| |
| <span class="sd"> >>> df.count()</span> |
| <span class="sd"> 3</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.collect"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.collect.html#pyspark.sql.DataFrame.collect">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">collect</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Row</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""Returns all the records in the DataFrame as a list of :class:`Row`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> list</span> |
| <span class="sd"> A list of :class:`Row` objects, each representing a row in the DataFrame.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.take : Returns the first `n` rows.</span> |
| <span class="sd"> DataFrame.head : Returns the first `n` rows.</span> |
| <span class="sd"> DataFrame.toPandas : Returns the data as a pandas DataFrame.</span> |
| <span class="sd"> DataFrame.toArrow : Returns the data as a PyArrow Table.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This method should only be used if the resulting list is expected to be small,</span> |
| <span class="sd"> as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example: Collecting all rows of a DataFrame</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.collect()</span> |
| <span class="sd"> [Row(age=14, name='Tom'), Row(age=23, name='Alice'), Row(age=16, name='Bob')]</span> |
| |
| <span class="sd"> Example: Collecting all rows after filtering</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.filter(df.age > 15).collect()</span> |
| <span class="sd"> [Row(age=23, name='Alice'), Row(age=16, name='Bob')]</span> |
| |
| <span class="sd"> Example: Collecting all rows after selecting specific columns</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.select("name").collect()</span> |
| <span class="sd"> [Row(name='Tom'), Row(name='Alice'), Row(name='Bob')]</span> |
| |
| <span class="sd"> Example: Collecting all rows after applying a function to a column</span> |
| |
| <span class="sd"> >>> from pyspark.sql.functions import upper</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.select(upper(df.name)).collect()</span> |
| <span class="sd"> [Row(upper(name)='TOM'), Row(upper(name)='ALICE'), Row(upper(name)='BOB')]</span> |
| |
| <span class="sd"> Example: Collecting all rows from a DataFrame and converting a specific column to a list</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> rows = df.collect()</span> |
| <span class="sd"> >>> [row["name"] for row in rows]</span> |
| <span class="sd"> ['Tom', 'Alice', 'Bob']</span> |
| |
| <span class="sd"> Example: Collecting all rows from a DataFrame and converting to a list of dictionaries</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> rows = df.collect()</span> |
| <span class="sd"> >>> [row.asDict() for row in rows]</span> |
| <span class="sd"> [{'age': 14, 'name': 'Tom'}, {'age': 23, 'name': 'Alice'}, {'age': 16, 'name': 'Bob'}]</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.toLocalIterator"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.toLocalIterator.html#pyspark.sql.DataFrame.toLocalIterator">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">toLocalIterator</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">prefetchPartitions</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="n">Iterator</span><span class="p">[</span><span class="n">Row</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns an iterator that contains all of the rows in this :class:`DataFrame`.</span> |
| <span class="sd"> The iterator will consume as much memory as the largest partition in this</span> |
| <span class="sd"> :class:`DataFrame`. With prefetch it may consume up to the memory of the 2 largest</span> |
| <span class="sd"> partitions.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> prefetchPartitions : bool, optional</span> |
| <span class="sd"> If Spark should pre-fetch the next partition before it is needed.</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> This argument does not take effect for Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> Iterator</span> |
| <span class="sd"> Iterator of rows.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> list(df.toLocalIterator())</span> |
| <span class="sd"> [Row(age=14, name='Tom'), Row(age=23, name='Alice'), Row(age=16, name='Bob')]</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.limit"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.limit.html#pyspark.sql.DataFrame.limit">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">limit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Limits the result count to the number specified.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> num : int</span> |
| <span class="sd"> Number of records to return. Will return this number of records</span> |
| <span class="sd"> or all records if the DataFrame contains less than this number of records.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Subset of the records</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.limit(1).show()</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> |age|name|</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> | 14| Tom|</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> >>> df.limit(0).show()</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> |age|name|</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.offset"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.offset.html#pyspark.sql.DataFrame.offset">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">offset</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class: `DataFrame` by skipping the first `n` rows.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.5.0</span> |
| <span class="sd"> Supports vanilla PySpark.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> num : int</span> |
| <span class="sd"> Number of records to skip.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Subset of the records</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.offset(1).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 23|Alice|</span> |
| <span class="sd"> | 16| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> >>> df.offset(10).show()</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> |age|name|</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.take"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.take.html#pyspark.sql.DataFrame.take">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">take</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Row</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""Returns the first ``num`` rows as a :class:`list` of :class:`Row`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> num : int</span> |
| <span class="sd"> Number of records to return. Will return this number of records</span> |
| <span class="sd"> or all records if the DataFrame contains less than this number of records..</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> list</span> |
| <span class="sd"> List of rows</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| |
| <span class="sd"> Return the first 2 rows of the :class:`DataFrame`.</span> |
| |
| <span class="sd"> >>> df.take(2)</span> |
| <span class="sd"> [Row(age=14, name='Tom'), Row(age=23, name='Alice')]</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.tail"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.tail.html#pyspark.sql.DataFrame.tail">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">tail</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">num</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Row</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the last ``num`` rows as a :class:`list` of :class:`Row`.</span> |
| |
| <span class="sd"> Running tail requires moving data into the application's driver process, and doing so with</span> |
| <span class="sd"> a very large ``num`` can crash the driver process with OutOfMemoryError.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> num : int</span> |
| <span class="sd"> Number of records to return. Will return this number of records</span> |
| <span class="sd"> or all records if the DataFrame contains less than this number of records.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> list</span> |
| <span class="sd"> List of rows</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| |
| <span class="sd"> >>> df.tail(2)</span> |
| <span class="sd"> [Row(age=23, name='Alice'), Row(age=16, name='Bob')]</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.foreach"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.foreach.html#pyspark.sql.DataFrame.foreach">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">foreach</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Row</span><span class="p">],</span> <span class="kc">None</span><span class="p">])</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Applies the ``f`` function to all :class:`Row` of this :class:`DataFrame`.</span> |
| |
| <span class="sd"> This is a shorthand for ``df.rdd.foreach()``.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> f : function</span> |
| <span class="sd"> A function that accepts one parameter which will</span> |
| <span class="sd"> receive each row to process.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> def func(person):</span> |
| <span class="sd"> ... print(person.name)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df.foreach(func)</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.foreachPartition"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.foreachPartition.html#pyspark.sql.DataFrame.foreachPartition">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">foreachPartition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">f</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[[</span><span class="n">Iterator</span><span class="p">[</span><span class="n">Row</span><span class="p">]],</span> <span class="kc">None</span><span class="p">])</span> <span class="o">-></span> <span class="kc">None</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Applies the ``f`` function to each partition of this :class:`DataFrame`.</span> |
| |
| <span class="sd"> This a shorthand for ``df.rdd.foreachPartition()``.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> f : function</span> |
| <span class="sd"> A function that accepts one parameter which will receive</span> |
| <span class="sd"> each partition to process.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> def func(itr):</span> |
| <span class="sd"> ... for person in itr:</span> |
| <span class="sd"> ... print(person.name)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df.foreachPartition(func)</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.cache"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.cache.html#pyspark.sql.DataFrame.cache">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">cache</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Persists the :class:`DataFrame` with the default storage level (`MEMORY_AND_DISK_DESER`).</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The default storage level has changed to `MEMORY_AND_DISK_DESER` to match Scala in 3.0.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Cached DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.cache()</span> |
| <span class="sd"> DataFrame[id: bigint]</span> |
| |
| <span class="sd"> >>> df.explain()</span> |
| <span class="sd"> == Physical Plan ==</span> |
| <span class="sd"> InMemoryTableScan ...</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.persist"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.persist.html#pyspark.sql.DataFrame.persist">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">persist</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">storageLevel</span><span class="p">:</span> <span class="n">StorageLevel</span> <span class="o">=</span> <span class="p">(</span><span class="n">StorageLevel</span><span class="o">.</span><span class="n">MEMORY_AND_DISK_DESER</span><span class="p">),</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Sets the storage level to persist the contents of the :class:`DataFrame` across</span> |
| <span class="sd"> operations after the first time it is computed. This can only be used to assign</span> |
| <span class="sd"> a new storage level if the :class:`DataFrame` does not have a storage level set yet.</span> |
| <span class="sd"> If no storage level is specified defaults to (`MEMORY_AND_DISK_DESER`)</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The default storage level has changed to `MEMORY_AND_DISK_DESER` to match Scala in 3.0.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> storageLevel : :class:`StorageLevel`</span> |
| <span class="sd"> Storage level to set for persistence. Default is MEMORY_AND_DISK_DESER.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Persisted DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.persist()</span> |
| <span class="sd"> DataFrame[id: bigint]</span> |
| |
| <span class="sd"> >>> df.explain()</span> |
| <span class="sd"> == Physical Plan ==</span> |
| <span class="sd"> InMemoryTableScan ...</span> |
| |
| <span class="sd"> Persists the data in the disk by specifying the storage level.</span> |
| |
| <span class="sd"> >>> from pyspark.storagelevel import StorageLevel</span> |
| <span class="sd"> >>> df.persist(StorageLevel.DISK_ONLY)</span> |
| <span class="sd"> DataFrame[id: bigint]</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">storageLevel</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">StorageLevel</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Get the :class:`DataFrame`'s current storage level.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`StorageLevel`</span> |
| <span class="sd"> Currently defined storage level.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df1 = spark.range(10)</span> |
| <span class="sd"> >>> df1.storageLevel</span> |
| <span class="sd"> StorageLevel(False, False, False, False, 1)</span> |
| <span class="sd"> >>> df1.cache().storageLevel</span> |
| <span class="sd"> StorageLevel(True, True, False, True, 1)</span> |
| |
| <span class="sd"> >>> df2 = spark.range(5)</span> |
| <span class="sd"> >>> df2.persist(StorageLevel.DISK_ONLY_2).storageLevel</span> |
| <span class="sd"> StorageLevel(True, False, False, False, 2)</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.unpersist"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.unpersist.html#pyspark.sql.DataFrame.unpersist">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">unpersist</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">blocking</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Marks the :class:`DataFrame` as non-persistent, and remove all blocks for it from</span> |
| <span class="sd"> memory and disk.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> `blocking` default has changed to ``False`` to match Scala in 2.0.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> blocking : bool</span> |
| <span class="sd"> Whether to block until all blocks are deleted.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Unpersisted DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.persist()</span> |
| <span class="sd"> DataFrame[id: bigint]</span> |
| <span class="sd"> >>> df.unpersist()</span> |
| <span class="sd"> DataFrame[id: bigint]</span> |
| <span class="sd"> >>> df = spark.range(1)</span> |
| <span class="sd"> >>> df.unpersist(True)</span> |
| <span class="sd"> DataFrame[id: bigint]</span> |
| <span class="sd"> """</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">is_cached</span> <span class="o">=</span> <span class="kc">False</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">unpersist</span><span class="p">(</span><span class="n">blocking</span><span class="p">)</span> |
| <span class="k">return</span> <span class="bp">self</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.coalesce"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.coalesce.html#pyspark.sql.DataFrame.coalesce">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">coalesce</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a new :class:`DataFrame` that has exactly `numPartitions` partitions.</span> |
| |
| <span class="sd"> Similar to coalesce defined on an :class:`RDD`, this operation results in a</span> |
| <span class="sd"> narrow dependency, e.g. if you go from 1000 partitions to 100 partitions,</span> |
| <span class="sd"> there will not be a shuffle, instead each of the 100 new partitions will</span> |
| <span class="sd"> claim 10 of the current partitions. If a larger number of partitions is requested,</span> |
| <span class="sd"> it will stay at the current number of partitions.</span> |
| |
| <span class="sd"> However, if you're doing a drastic coalesce, e.g. to numPartitions = 1,</span> |
| <span class="sd"> this may result in your computation taking place on fewer nodes than</span> |
| <span class="sd"> you like (e.g. one node in the case of numPartitions = 1). To avoid this,</span> |
| <span class="sd"> you can call repartition(). This will add a shuffle step, but means the</span> |
| <span class="sd"> current upstream partitions will be executed in parallel (per whatever</span> |
| <span class="sd"> the current partitioning is).</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numPartitions : int</span> |
| <span class="sd"> specify the target number of partitions</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> spark.range(0, 10, 1, 3).select(</span> |
| <span class="sd"> ... sf.spark_partition_id().alias("partition")</span> |
| <span class="sd"> ... ).distinct().sort("partition").show()</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |partition|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> +---------+</span> |
| |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> spark.range(0, 10, 1, 3).coalesce(1).select(</span> |
| <span class="sd"> ... sf.spark_partition_id().alias("partition")</span> |
| <span class="sd"> ... ).distinct().sort("partition").show()</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |partition|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> """</span> |
| <span class="k">return</span> <span class="n">DataFrame</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">_jdf</span><span class="o">.</span><span class="n">coalesce</span><span class="p">(</span><span class="n">numPartitions</span><span class="p">),</span> <span class="bp">self</span><span class="o">.</span><span class="n">sparkSession</span><span class="p">)</span></div> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">repartition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">repartition</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.repartition"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.repartition.html#pyspark.sql.DataFrame.repartition">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="k">def</span> <span class="nf">repartition</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="s2">"ColumnOrName"</span><span class="p">],</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The</span> |
| <span class="sd"> resulting :class:`DataFrame` is hash partitioned.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numPartitions : int</span> |
| <span class="sd"> can be an int to specify the target number of partitions or a Column.</span> |
| <span class="sd"> If it is a Column, it will be used as the first partitioning column. If not specified,</span> |
| <span class="sd"> the default number of partitions is used.</span> |
| <span class="sd"> cols : str or :class:`Column`</span> |
| <span class="sd"> partitioning columns.</span> |
| |
| <span class="sd"> .. versionchanged:: 1.6.0</span> |
| <span class="sd"> Added optional arguments to specify the partitioning columns. Also made numPartitions</span> |
| <span class="sd"> optional if partitioning columns are specified.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Repartitioned DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> df = spark.range(0, 64, 1, 9).withColumn(</span> |
| <span class="sd"> ... "name", sf.concat(sf.lit("name_"), sf.col("id").cast("string"))</span> |
| <span class="sd"> ... ).withColumn(</span> |
| <span class="sd"> ... "age", sf.col("id") - 32</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select(</span> |
| <span class="sd"> ... sf.spark_partition_id().alias("partition")</span> |
| <span class="sd"> ... ).distinct().sort("partition").show()</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |partition|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> | 5|</span> |
| <span class="sd"> | 6|</span> |
| <span class="sd"> | 7|</span> |
| <span class="sd"> | 8|</span> |
| <span class="sd"> +---------+</span> |
| |
| <span class="sd"> Repartition the data into 10 partitions.</span> |
| |
| <span class="sd"> >>> df.repartition(10).select(</span> |
| <span class="sd"> ... sf.spark_partition_id().alias("partition")</span> |
| <span class="sd"> ... ).distinct().sort("partition").show()</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |partition|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> | 5|</span> |
| <span class="sd"> | 6|</span> |
| <span class="sd"> | 7|</span> |
| <span class="sd"> | 8|</span> |
| <span class="sd"> | 9|</span> |
| <span class="sd"> +---------+</span> |
| |
| <span class="sd"> Repartition the data into 7 partitions by 'age' column.</span> |
| |
| <span class="sd"> >>> df.repartition(7, "age").select(</span> |
| <span class="sd"> ... sf.spark_partition_id().alias("partition")</span> |
| <span class="sd"> ... ).distinct().sort("partition").show()</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |partition|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> | 4|</span> |
| <span class="sd"> | 5|</span> |
| <span class="sd"> | 6|</span> |
| <span class="sd"> +---------+</span> |
| |
| <span class="sd"> Repartition the data into 3 partitions by 'age' and 'name' columns.</span> |
| |
| <span class="sd"> >>> df.repartition(3, "name", "age").select(</span> |
| <span class="sd"> ... sf.spark_partition_id().alias("partition")</span> |
| <span class="sd"> ... ).distinct().sort("partition").show()</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> |partition|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> | 0|</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> +---------+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">repartitionByRange</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">:</span> <span class="nb">int</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">repartitionByRange</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.repartitionByRange"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.repartitionByRange.html#pyspark.sql.DataFrame.repartitionByRange">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="k">def</span> <span class="nf">repartitionByRange</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">numPartitions</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="s2">"ColumnOrName"</span><span class="p">],</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a new :class:`DataFrame` partitioned by the given partitioning expressions. The</span> |
| <span class="sd"> resulting :class:`DataFrame` is range partitioned.</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> numPartitions : int</span> |
| <span class="sd"> can be an int to specify the target number of partitions or a Column.</span> |
| <span class="sd"> If it is a Column, it will be used as the first partitioning column. If not specified,</span> |
| <span class="sd"> the default number of partitions is used.</span> |
| <span class="sd"> cols : str or :class:`Column`</span> |
| <span class="sd"> partitioning columns.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Repartitioned DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> At least one partition-by expression must be specified.</span> |
| <span class="sd"> When no explicit sort order is specified, "ascending nulls first" is assumed.</span> |
| |
| <span class="sd"> Due to performance reasons this method uses sampling to estimate the ranges.</span> |
| <span class="sd"> Hence, the output may not be consistent, since sampling can return different values.</span> |
| <span class="sd"> The sample size can be controlled by the config</span> |
| <span class="sd"> `spark.sql.execution.rangeExchange.sampleSizePerPartition`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Repartition the data into 2 partitions by range in 'age' column.</span> |
| <span class="sd"> For example, the first partition can have ``(14, "Tom")`` and ``(16, "Bob")``,</span> |
| <span class="sd"> and the second partition would have ``(23, "Alice")``.</span> |
| |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"]</span> |
| <span class="sd"> ... ).repartitionByRange(2, "age").select(</span> |
| <span class="sd"> ... "age", "name", sf.spark_partition_id()</span> |
| <span class="sd"> ... ).show()</span> |
| <span class="sd"> +---+-----+--------------------+</span> |
| <span class="sd"> |age| name|SPARK_PARTITION_ID()|</span> |
| <span class="sd"> +---+-----+--------------------+</span> |
| <span class="sd"> | 14| Tom| 0|</span> |
| <span class="sd"> | 16| Bob| 0|</span> |
| <span class="sd"> | 23|Alice| 1|</span> |
| <span class="sd"> +---+-----+--------------------+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.distinct"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.distinct.html#pyspark.sql.DataFrame.distinct">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">distinct</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`DataFrame` containing the distinct rows in this :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> DataFrame with distinct records.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.dropDuplicates</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Remove duplicate rows from a DataFrame</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (23, "Alice")], ["age", "name"])</span> |
| <span class="sd"> >>> df.distinct().show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 14| Tom|</span> |
| <span class="sd"> | 23|Alice|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> Count the number of distinct rows in a DataFrame</span> |
| |
| <span class="sd"> >>> df.distinct().count()</span> |
| <span class="sd"> 2</span> |
| |
| <span class="sd"> Get distinct rows from a DataFrame with multiple columns</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom", "M"), (23, "Alice", "F"), (23, "Alice", "F"), (14, "Tom", "M")],</span> |
| <span class="sd"> ... ["age", "name", "gender"])</span> |
| <span class="sd"> >>> df.distinct().show()</span> |
| <span class="sd"> +---+-----+------+</span> |
| <span class="sd"> |age| name|gender|</span> |
| <span class="sd"> +---+-----+------+</span> |
| <span class="sd"> | 14| Tom| M|</span> |
| <span class="sd"> | 23|Alice| F|</span> |
| <span class="sd"> +---+-----+------+</span> |
| |
| <span class="sd"> Get distinct values from a specific column in a DataFrame</span> |
| |
| <span class="sd"> >>> df.select("name").distinct().show()</span> |
| <span class="sd"> +-----+</span> |
| <span class="sd"> | name|</span> |
| <span class="sd"> +-----+</span> |
| <span class="sd"> | Tom|</span> |
| <span class="sd"> |Alice|</span> |
| <span class="sd"> +-----+</span> |
| |
| <span class="sd"> Count the number of distinct values in a specific column</span> |
| |
| <span class="sd"> >>> df.select("name").distinct().count()</span> |
| <span class="sd"> 2</span> |
| |
| <span class="sd"> Get distinct values from multiple columns in DataFrame</span> |
| |
| <span class="sd"> >>> df.select("name", "gender").distinct().show()</span> |
| <span class="sd"> +-----+------+</span> |
| <span class="sd"> | name|gender|</span> |
| <span class="sd"> +-----+------+</span> |
| <span class="sd"> | Tom| M|</span> |
| <span class="sd"> |Alice| F|</span> |
| <span class="sd"> +-----+------+</span> |
| |
| <span class="sd"> Get distinct rows from a DataFrame with null values</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom", "M"), (23, "Alice", "F"), (23, "Alice", "F"), (14, "Tom", None)],</span> |
| <span class="sd"> ... ["age", "name", "gender"])</span> |
| <span class="sd"> >>> df.distinct().show()</span> |
| <span class="sd"> +---+-----+------+</span> |
| <span class="sd"> |age| name|gender|</span> |
| <span class="sd"> +---+-----+------+</span> |
| <span class="sd"> | 14| Tom| M|</span> |
| <span class="sd"> | 23|Alice| F|</span> |
| <span class="sd"> | 14| Tom| NULL|</span> |
| <span class="sd"> +---+-----+------+</span> |
| |
| <span class="sd"> Get distinct non-null values from a DataFrame</span> |
| |
| <span class="sd"> >>> df.distinct().filter(df.gender.isNotNull()).show()</span> |
| <span class="sd"> +---+-----+------+</span> |
| <span class="sd"> |age| name|gender|</span> |
| <span class="sd"> +---+-----+------+</span> |
| <span class="sd"> | 14| Tom| M|</span> |
| <span class="sd"> | 23|Alice| F|</span> |
| <span class="sd"> +---+-----+------+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">sample</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">fraction</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">sample</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">withReplacement</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">bool</span><span class="p">],</span> |
| <span class="n">fraction</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.sample"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.sample.html#pyspark.sql.DataFrame.sample">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="k">def</span> <span class="nf">sample</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">withReplacement</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">float</span><span class="p">,</span> <span class="nb">bool</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">fraction</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">float</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a sampled subset of this :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> withReplacement : bool, optional</span> |
| <span class="sd"> Sample with replacement or not (default ``False``).</span> |
| <span class="sd"> fraction : float, optional</span> |
| <span class="sd"> Fraction of rows to generate, range [0.0, 1.0].</span> |
| <span class="sd"> seed : int, optional</span> |
| <span class="sd"> Seed for sampling (default a random seed).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Sampled rows from given DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This is not guaranteed to provide exactly the fraction specified of the total</span> |
| <span class="sd"> count of the given :class:`DataFrame`.</span> |
| |
| <span class="sd"> `fraction` is required and, `withReplacement` and `seed` are optional.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.range(10)</span> |
| <span class="sd"> >>> df.sample(0.5, 3).count() # doctest: +SKIP</span> |
| <span class="sd"> 7</span> |
| <span class="sd"> >>> df.sample(fraction=0.5, seed=3).count() # doctest: +SKIP</span> |
| <span class="sd"> 7</span> |
| <span class="sd"> >>> df.sample(withReplacement=True, fraction=0.5, seed=3).count() # doctest: +SKIP</span> |
| <span class="sd"> 1</span> |
| <span class="sd"> >>> df.sample(1.0).count()</span> |
| <span class="sd"> 10</span> |
| <span class="sd"> >>> df.sample(fraction=1.0).count()</span> |
| <span class="sd"> 10</span> |
| <span class="sd"> >>> df.sample(False, fraction=1.0).count()</span> |
| <span class="sd"> 10</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.sampleBy"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.sampleBy.html#pyspark.sql.DataFrame.sampleBy">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">sampleBy</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">col</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">fractions</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="nb">float</span><span class="p">],</span> <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a stratified sample without replacement based on the</span> |
| <span class="sd"> fraction given on each stratum.</span> |
| |
| <span class="sd"> .. versionadded:: 1.5.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col : :class:`Column` or str</span> |
| <span class="sd"> column that defines strata</span> |
| |
| <span class="sd"> .. versionchanged:: 3.0.0</span> |
| <span class="sd"> Added sampling by a column of :class:`Column`</span> |
| <span class="sd"> fractions : dict</span> |
| <span class="sd"> sampling fraction for each stratum. If a stratum is not</span> |
| <span class="sd"> specified, we treat its fraction as zero.</span> |
| <span class="sd"> seed : int, optional</span> |
| <span class="sd"> random seed</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> a new :class:`DataFrame` that represents the stratified sample</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import col</span> |
| <span class="sd"> >>> dataset = spark.range(0, 100).select((col("id") % 3).alias("key"))</span> |
| <span class="sd"> >>> sampled = dataset.sampleBy("key", fractions={0: 0.1, 1: 0.2}, seed=0)</span> |
| <span class="sd"> >>> sampled.groupBy("key").count().orderBy("key").show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |key|count|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 0| 3|</span> |
| <span class="sd"> | 1| 6|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> >>> dataset.sampleBy(col("key"), fractions={2: 1.0}, seed=0).count()</span> |
| <span class="sd"> 33</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.randomSplit"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.randomSplit.html#pyspark.sql.DataFrame.randomSplit">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">randomSplit</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">weights</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="s2">"DataFrame"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""Randomly splits this :class:`DataFrame` with the provided weights.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> weights : list</span> |
| <span class="sd"> list of doubles as weights with which to split the :class:`DataFrame`.</span> |
| <span class="sd"> Weights will be normalized if they don't sum up to 1.0.</span> |
| <span class="sd"> seed : int, optional</span> |
| <span class="sd"> The seed for sampling.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> list</span> |
| <span class="sd"> List of DataFrames.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... Row(age=10, height=80, name="Alice"),</span> |
| <span class="sd"> ... Row(age=5, height=None, name="Bob"),</span> |
| <span class="sd"> ... Row(age=None, height=None, name="Tom"),</span> |
| <span class="sd"> ... Row(age=None, height=None, name=None),</span> |
| <span class="sd"> ... ])</span> |
| |
| <span class="sd"> >>> splits = df.randomSplit([1.0, 2.0], 24)</span> |
| <span class="sd"> >>> splits[0].count()</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> splits[1].count()</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">dtypes</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]]:</span> |
| <span class="w"> </span><span class="sd">"""Returns all column names and their data types as a list.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> list</span> |
| <span class="sd"> List of columns as tuple pairs.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.dtypes</span> |
| <span class="sd"> [('age', 'bigint'), ('name', 'string')]</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@property</span> |
| <span class="k">def</span> <span class="nf">columns</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Retrieves the names of all columns in the :class:`DataFrame` as a list.</span> |
| |
| <span class="sd"> The order of the column names in the list reflects their order in the DataFrame.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> list</span> |
| <span class="sd"> List of column names in the DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Retrieve column names of a DataFrame</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom", "CA"), (23, "Alice", "NY"), (16, "Bob", "TX")],</span> |
| <span class="sd"> ... ["age", "name", "state"]</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.columns</span> |
| <span class="sd"> ['age', 'name', 'state']</span> |
| |
| <span class="sd"> Example 2: Using column names to project specific columns</span> |
| |
| <span class="sd"> >>> selected_cols = [col for col in df.columns if col != "age"]</span> |
| <span class="sd"> >>> df.select(selected_cols).show()</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> | name|state|</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> | Tom| CA|</span> |
| <span class="sd"> |Alice| NY|</span> |
| <span class="sd"> | Bob| TX|</span> |
| <span class="sd"> +-----+-----+</span> |
| |
| <span class="sd"> Example 3: Checking if a specific column exists in a DataFrame</span> |
| |
| <span class="sd"> >>> "state" in df.columns</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> "salary" in df.columns</span> |
| <span class="sd"> False</span> |
| |
| <span class="sd"> Example 4: Iterating over columns to apply a transformation</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as f</span> |
| <span class="sd"> >>> for col_name in df.columns:</span> |
| <span class="sd"> ... df = df.withColumn(col_name, f.upper(f.col(col_name)))</span> |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +---+-----+-----+</span> |
| <span class="sd"> |age| name|state|</span> |
| <span class="sd"> +---+-----+-----+</span> |
| <span class="sd"> | 14| TOM| CA|</span> |
| <span class="sd"> | 23|ALICE| NY|</span> |
| <span class="sd"> | 16| BOB| TX|</span> |
| <span class="sd"> +---+-----+-----+</span> |
| |
| <span class="sd"> Example 5: Renaming columns and checking the updated column names</span> |
| |
| <span class="sd"> >>> df = df.withColumnRenamed("name", "first_name")</span> |
| <span class="sd"> >>> df.columns</span> |
| <span class="sd"> ['age', 'first_name', 'state']</span> |
| |
| <span class="sd"> Example 6: Using the `columns` property to ensure two DataFrames have the</span> |
| <span class="sd"> same columns before a union</span> |
| |
| <span class="sd"> >>> df2 = spark.createDataFrame(</span> |
| <span class="sd"> ... [(30, "Eve", "FL"), (40, "Sam", "WA")], ["age", "name", "location"])</span> |
| <span class="sd"> >>> df.columns == df2.columns</span> |
| <span class="sd"> False</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.colRegex"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.colRegex.html#pyspark.sql.DataFrame.colRegex">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">colRegex</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">colName</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Selects column based on the column name specified as a regex and returns it</span> |
| <span class="sd"> as :class:`Column`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> colName : str</span> |
| <span class="sd"> string, column name specified as a regex.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`Column`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("a", 1), ("b", 2), ("c", 3)], ["Col1", "Col2"])</span> |
| <span class="sd"> >>> df.select(df.colRegex("`(Col1)?+.+`")).show()</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> |Col2|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> | 1|</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 3|</span> |
| <span class="sd"> +----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.to"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.to.html#pyspark.sql.DataFrame.to">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">to</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">schema</span><span class="p">:</span> <span class="n">StructType</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a new :class:`DataFrame` where each row is reconciled to match the specified</span> |
| <span class="sd"> schema.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> schema : :class:`StructType`</span> |
| <span class="sd"> Specified schema.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Reconciled DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> * Reorder columns and/or inner fields by name to match the specified schema.</span> |
| |
| <span class="sd"> * Project away columns and/or inner fields that are not needed by the specified schema.</span> |
| <span class="sd"> Missing columns and/or inner fields (present in the specified schema but not input</span> |
| <span class="sd"> DataFrame) lead to failures.</span> |
| |
| <span class="sd"> * Cast the columns and/or inner fields to match the data types in the specified schema,</span> |
| <span class="sd"> if the types are compatible, e.g., numeric to numeric (error if overflows), but</span> |
| <span class="sd"> not string to int.</span> |
| |
| <span class="sd"> * Carry over the metadata from the specified schema, while the columns and/or inner fields</span> |
| <span class="sd"> still keep their own metadata if not overwritten by the specified schema.</span> |
| |
| <span class="sd"> * Fail if the nullability is not compatible. For example, the column and/or inner field</span> |
| <span class="sd"> is nullable but the specified schema requires them to be not nullable.</span> |
| |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.types import StructField, StringType</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("a", 1)], ["i", "j"])</span> |
| <span class="sd"> >>> df.schema</span> |
| <span class="sd"> StructType([StructField('i', StringType(), True), StructField('j', LongType(), True)])</span> |
| |
| <span class="sd"> >>> schema = StructType([StructField("j", StringType()), StructField("i", StringType())])</span> |
| <span class="sd"> >>> df2 = df.to(schema)</span> |
| <span class="sd"> >>> df2.schema</span> |
| <span class="sd"> StructType([StructField('j', StringType(), True), StructField('i', StringType(), True)])</span> |
| <span class="sd"> >>> df2.show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | j| i|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| a|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.alias"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.alias.html#pyspark.sql.DataFrame.alias">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">alias</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">alias</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`DataFrame` with an alias set.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> alias : str</span> |
| <span class="sd"> an alias name to be set for the :class:`DataFrame`.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Aliased DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import col, desc</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df_as1 = df.alias("df_as1")</span> |
| <span class="sd"> >>> df_as2 = df.alias("df_as2")</span> |
| <span class="sd"> >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner')</span> |
| <span class="sd"> >>> joined_df.select(</span> |
| <span class="sd"> ... "df_as1.name", "df_as2.name", "df_as2.age").sort(desc("df_as1.name")).show()</span> |
| <span class="sd"> +-----+-----+---+</span> |
| <span class="sd"> | name| name|age|</span> |
| <span class="sd"> +-----+-----+---+</span> |
| <span class="sd"> | Tom| Tom| 14|</span> |
| <span class="sd"> | Bob| Bob| 16|</span> |
| <span class="sd"> |Alice|Alice| 23|</span> |
| <span class="sd"> +-----+-----+---+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.crossJoin"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.crossJoin.html#pyspark.sql.DataFrame.crossJoin">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">crossJoin</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the cartesian product with another :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : :class:`DataFrame`</span> |
| <span class="sd"> Right side of the cartesian product.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Joined DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame(</span> |
| <span class="sd"> ... [Row(height=80, name="Tom"), Row(height=85, name="Bob")])</span> |
| <span class="sd"> >>> df.crossJoin(df2.select("height")).select("age", "name", "height").show()</span> |
| <span class="sd"> +---+-----+------+</span> |
| <span class="sd"> |age| name|height|</span> |
| <span class="sd"> +---+-----+------+</span> |
| <span class="sd"> | 14| Tom| 80|</span> |
| <span class="sd"> | 14| Tom| 85|</span> |
| <span class="sd"> | 23|Alice| 80|</span> |
| <span class="sd"> | 23|Alice| 85|</span> |
| <span class="sd"> | 16| Bob| 80|</span> |
| <span class="sd"> | 16| Bob| 85|</span> |
| <span class="sd"> +---+-----+------+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.join"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.join.html#pyspark.sql.DataFrame.join">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">join</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">other</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">,</span> |
| <span class="n">on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Column</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">how</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Joins with another :class:`DataFrame`, using the given join expression.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : :class:`DataFrame`</span> |
| <span class="sd"> Right side of the join</span> |
| <span class="sd"> on : str, list or :class:`Column`, optional</span> |
| <span class="sd"> a string for the join column name, a list of column names,</span> |
| <span class="sd"> a join expression (Column), or a list of Columns.</span> |
| <span class="sd"> If `on` is a string or a list of strings indicating the name of the join column(s),</span> |
| <span class="sd"> the column(s) must exist on both sides, and this performs an equi-join.</span> |
| <span class="sd"> how : str, optional</span> |
| <span class="sd"> default ``inner``. Must be one of: ``inner``, ``cross``, ``outer``,</span> |
| <span class="sd"> ``full``, ``fullouter``, ``full_outer``, ``left``, ``leftouter``, ``left_outer``,</span> |
| <span class="sd"> ``right``, ``rightouter``, ``right_outer``, ``semi``, ``leftsemi``, ``left_semi``,</span> |
| <span class="sd"> ``anti``, ``leftanti`` and ``left_anti``.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Joined DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> The following examples demonstrate various join types among ``df1``, ``df2``, and ``df3``.</span> |
| |
| <span class="sd"> >>> import pyspark.sql.functions as sf</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> df = spark.createDataFrame([Row(name="Alice", age=2), Row(name="Bob", age=5)])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([Row(name="Tom", height=80), Row(name="Bob", height=85)])</span> |
| <span class="sd"> >>> df3 = spark.createDataFrame([</span> |
| <span class="sd"> ... Row(name="Alice", age=10, height=80),</span> |
| <span class="sd"> ... Row(name="Bob", age=5, height=None),</span> |
| <span class="sd"> ... Row(name="Tom", age=None, height=None),</span> |
| <span class="sd"> ... Row(name=None, age=None, height=None),</span> |
| <span class="sd"> ... ])</span> |
| |
| <span class="sd"> Inner join on columns (default)</span> |
| |
| <span class="sd"> >>> df.join(df2, "name").show()</span> |
| <span class="sd"> +----+---+------+</span> |
| <span class="sd"> |name|age|height|</span> |
| <span class="sd"> +----+---+------+</span> |
| <span class="sd"> | Bob| 5| 85|</span> |
| <span class="sd"> +----+---+------+</span> |
| |
| <span class="sd"> >>> df.join(df3, ["name", "age"]).show()</span> |
| <span class="sd"> +----+---+------+</span> |
| <span class="sd"> |name|age|height|</span> |
| <span class="sd"> +----+---+------+</span> |
| <span class="sd"> | Bob| 5| NULL|</span> |
| <span class="sd"> +----+---+------+</span> |
| |
| <span class="sd"> Outer join on a single column with an explicit join condition.</span> |
| |
| <span class="sd"> When the join condition is explicited stated: `df.name == df2.name`, this will</span> |
| <span class="sd"> produce all records where the names match, as well as those that don't (since</span> |
| <span class="sd"> it's an outer join). If there are names in `df2` that are not present in `df`,</span> |
| <span class="sd"> they will appear with `NULL` in the `name` column of `df`, and vice versa for `df2`.</span> |
| |
| <span class="sd"> >>> joined = df.join(df2, df.name == df2.name, "outer").sort(sf.desc(df.name))</span> |
| <span class="sd"> >>> joined.show() # doctest: +SKIP</span> |
| <span class="sd"> +-----+----+----+------+</span> |
| <span class="sd"> | name| age|name|height|</span> |
| <span class="sd"> +-----+----+----+------+</span> |
| <span class="sd"> | Bob| 5| Bob| 85|</span> |
| <span class="sd"> |Alice| 2|NULL| NULL|</span> |
| <span class="sd"> | NULL|NULL| Tom| 80|</span> |
| <span class="sd"> +-----+----+----+------+</span> |
| |
| <span class="sd"> To unambiguously select output columns, specify the dataframe along with the column name:</span> |
| |
| <span class="sd"> >>> joined.select(df.name, df2.height).show() # doctest: +SKIP</span> |
| <span class="sd"> +-----+------+</span> |
| <span class="sd"> | name|height|</span> |
| <span class="sd"> +-----+------+</span> |
| <span class="sd"> | Bob| 85|</span> |
| <span class="sd"> |Alice| NULL|</span> |
| <span class="sd"> | NULL| 80|</span> |
| <span class="sd"> +-----+------+</span> |
| |
| <span class="sd"> However, in self-joins, direct column references can cause ambiguity:</span> |
| |
| <span class="sd"> >>> df.join(df, df.name == df.name, "outer").select(df.name).show() # doctest: +SKIP</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> pyspark.errors.exceptions.captured.AnalysisException: Column name#0 are ambiguous...</span> |
| |
| <span class="sd"> A better approach is to assign aliases to the dataframes, and then reference</span> |
| <span class="sd"> the ouptut columns from the join operation using these aliases:</span> |
| |
| <span class="sd"> >>> df.alias("a").join(</span> |
| <span class="sd"> ... df.alias("b"), sf.col("a.name") == sf.col("b.name"), "outer"</span> |
| <span class="sd"> ... ).sort(sf.desc("a.name")).select("a.name", "b.age").show()</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | name|age|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | Bob| 5|</span> |
| <span class="sd"> |Alice| 2|</span> |
| <span class="sd"> +-----+---+</span> |
| |
| <span class="sd"> Outer join on a single column with implicit join condition using column name</span> |
| |
| <span class="sd"> When you provide the column name directly as the join condition, Spark will treat</span> |
| <span class="sd"> both name columns as one, and will not produce separate columns for `df.name` and</span> |
| <span class="sd"> `df2.name`. This avoids having duplicate columns in the output.</span> |
| |
| <span class="sd"> >>> df.join(df2, "name", "outer").sort(sf.desc("name")).show()</span> |
| <span class="sd"> +-----+----+------+</span> |
| <span class="sd"> | name| age|height|</span> |
| <span class="sd"> +-----+----+------+</span> |
| <span class="sd"> | Tom|NULL| 80|</span> |
| <span class="sd"> | Bob| 5| 85|</span> |
| <span class="sd"> |Alice| 2| NULL|</span> |
| <span class="sd"> +-----+----+------+</span> |
| |
| <span class="sd"> Outer join on multiple columns</span> |
| |
| <span class="sd"> >>> df.join(df3, ["name", "age"], "outer").sort("name", "age").show()</span> |
| <span class="sd"> +-----+----+------+</span> |
| <span class="sd"> | name| age|height|</span> |
| <span class="sd"> +-----+----+------+</span> |
| <span class="sd"> | NULL|NULL| NULL|</span> |
| <span class="sd"> |Alice| 2| NULL|</span> |
| <span class="sd"> |Alice| 10| 80|</span> |
| <span class="sd"> | Bob| 5| NULL|</span> |
| <span class="sd"> | Tom|NULL| NULL|</span> |
| <span class="sd"> +-----+----+------+</span> |
| |
| <span class="sd"> Left outer join on columns</span> |
| |
| <span class="sd"> >>> df.join(df2, "name", "left_outer").show()</span> |
| <span class="sd"> +-----+---+------+</span> |
| <span class="sd"> | name|age|height|</span> |
| <span class="sd"> +-----+---+------+</span> |
| <span class="sd"> |Alice| 2| NULL|</span> |
| <span class="sd"> | Bob| 5| 85|</span> |
| <span class="sd"> +-----+---+------+</span> |
| |
| <span class="sd"> Right outer join on columns</span> |
| |
| <span class="sd"> >>> df.join(df2, "name", "right_outer").show()</span> |
| <span class="sd"> +----+----+------+</span> |
| <span class="sd"> |name| age|height|</span> |
| <span class="sd"> +----+----+------+</span> |
| <span class="sd"> | Tom|NULL| 80|</span> |
| <span class="sd"> | Bob| 5| 85|</span> |
| <span class="sd"> +----+----+------+</span> |
| |
| <span class="sd"> Left semi join on columns</span> |
| |
| <span class="sd"> >>> df.join(df2, "name", "left_semi").show()</span> |
| <span class="sd"> +----+---+</span> |
| <span class="sd"> |name|age|</span> |
| <span class="sd"> +----+---+</span> |
| <span class="sd"> | Bob| 5|</span> |
| <span class="sd"> +----+---+</span> |
| |
| <span class="sd"> Left anti join on columns</span> |
| |
| <span class="sd"> >>> df.join(df2, "name", "left_anti").show()</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | name|age|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |Alice| 2|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="c1"># TODO(SPARK-22947): Fix the DataFrame API.</span> |
| <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">_joinAsOf</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">other</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">,</span> |
| <span class="n">leftAsOfColumn</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> |
| <span class="n">rightAsOfColumn</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">],</span> |
| <span class="n">on</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Column</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">how</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="o">*</span><span class="p">,</span> |
| <span class="n">tolerance</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Column</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">allowExactMatches</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">True</span><span class="p">,</span> |
| <span class="n">direction</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"backward"</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Perform an as-of join.</span> |
| |
| <span class="sd"> This is similar to a left-join except that we match on the nearest</span> |
| <span class="sd"> key rather than equal keys.</span> |
| |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : :class:`DataFrame`</span> |
| <span class="sd"> Right side of the join</span> |
| <span class="sd"> leftAsOfColumn : str or :class:`Column`</span> |
| <span class="sd"> a string for the as-of join column name, or a Column</span> |
| <span class="sd"> rightAsOfColumn : str or :class:`Column`</span> |
| <span class="sd"> a string for the as-of join column name, or a Column</span> |
| <span class="sd"> on : str, list or :class:`Column`, optional</span> |
| <span class="sd"> a string for the join column name, a list of column names,</span> |
| <span class="sd"> a join expression (Column), or a list of Columns.</span> |
| <span class="sd"> If `on` is a string or a list of strings indicating the name of the join column(s),</span> |
| <span class="sd"> the column(s) must exist on both sides, and this performs an equi-join.</span> |
| <span class="sd"> how : str, optional</span> |
| <span class="sd"> default ``inner``. Must be one of: ``inner`` and ``left``.</span> |
| <span class="sd"> tolerance : :class:`Column`, optional</span> |
| <span class="sd"> an asof tolerance within this range; must be compatible</span> |
| <span class="sd"> with the merge index.</span> |
| <span class="sd"> allowExactMatches : bool, optional</span> |
| <span class="sd"> default ``True``.</span> |
| <span class="sd"> direction : str, optional</span> |
| <span class="sd"> default ``backward``. Must be one of: ``backward``, ``forward``, and ``nearest``.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> The following performs an as-of join between ``left`` and ``right``.</span> |
| |
| <span class="sd"> >>> left = spark.createDataFrame([(1, "a"), (5, "b"), (10, "c")], ["a", "left_val"])</span> |
| <span class="sd"> >>> right = spark.createDataFrame([(1, 1), (2, 2), (3, 3), (6, 6), (7, 7)],</span> |
| <span class="sd"> ... ["a", "right_val"])</span> |
| <span class="sd"> >>> left._joinAsOf(</span> |
| <span class="sd"> ... right, leftAsOfColumn="a", rightAsOfColumn="a"</span> |
| <span class="sd"> ... ).select(left.a, 'left_val', 'right_val').sort("a").collect()</span> |
| <span class="sd"> [Row(a=1, left_val='a', right_val=1),</span> |
| <span class="sd"> Row(a=5, left_val='b', right_val=3),</span> |
| <span class="sd"> Row(a=10, left_val='c', right_val=7)]</span> |
| |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> left._joinAsOf(</span> |
| <span class="sd"> ... right, leftAsOfColumn="a", rightAsOfColumn="a", tolerance=sf.lit(1)</span> |
| <span class="sd"> ... ).select(left.a, 'left_val', 'right_val').sort("a").collect()</span> |
| <span class="sd"> [Row(a=1, left_val='a', right_val=1)]</span> |
| |
| <span class="sd"> >>> left._joinAsOf(</span> |
| <span class="sd"> ... right, leftAsOfColumn="a", rightAsOfColumn="a", how="left", tolerance=sf.lit(1)</span> |
| <span class="sd"> ... ).select(left.a, 'left_val', 'right_val').sort("a").collect()</span> |
| <span class="sd"> [Row(a=1, left_val='a', right_val=1),</span> |
| <span class="sd"> Row(a=5, left_val='b', right_val=None),</span> |
| <span class="sd"> Row(a=10, left_val='c', right_val=None)]</span> |
| |
| <span class="sd"> >>> left._joinAsOf(</span> |
| <span class="sd"> ... right, leftAsOfColumn="a", rightAsOfColumn="a", allowExactMatches=False</span> |
| <span class="sd"> ... ).select(left.a, 'left_val', 'right_val').sort("a").collect()</span> |
| <span class="sd"> [Row(a=5, left_val='b', right_val=3),</span> |
| <span class="sd"> Row(a=10, left_val='c', right_val=7)]</span> |
| |
| <span class="sd"> >>> left._joinAsOf(</span> |
| <span class="sd"> ... right, leftAsOfColumn="a", rightAsOfColumn="a", direction="forward"</span> |
| <span class="sd"> ... ).select(left.a, 'left_val', 'right_val').sort("a").collect()</span> |
| <span class="sd"> [Row(a=1, left_val='a', right_val=1),</span> |
| <span class="sd"> Row(a=5, left_val='b', right_val=6)]</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.sortWithinPartitions"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.sortWithinPartitions.html#pyspark.sql.DataFrame.sortWithinPartitions">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">sortWithinPartitions</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]],</span> |
| <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`DataFrame` with each partition sorted by the specified column(s).</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : int, str, list or :class:`Column`, optional</span> |
| <span class="sd"> list of :class:`Column` or column names or column ordinals to sort by.</span> |
| |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| <span class="sd"> Supports column ordinal.</span> |
| |
| <span class="sd"> Other Parameters</span> |
| <span class="sd"> ----------------</span> |
| <span class="sd"> ascending : bool or list, optional, default True</span> |
| <span class="sd"> boolean or list of boolean.</span> |
| <span class="sd"> Sort ascending vs. descending. Specify list for multiple sort orders.</span> |
| <span class="sd"> If a list is specified, the length of the list must equal the length of the `cols`.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> DataFrame sorted by partitions.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> A column ordinal starts from 1, which is different from the</span> |
| <span class="sd"> 0-based :meth:`__getitem__`.</span> |
| <span class="sd"> If a column ordinal is negative, it means sort descending.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df.sortWithinPartitions("age", ascending=False)</span> |
| <span class="sd"> DataFrame[age: bigint, name: string]</span> |
| |
| <span class="sd"> >>> df.coalesce(1).sortWithinPartitions(1).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> >>> df.coalesce(1).sortWithinPartitions(-1).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.sort"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.sort.html#pyspark.sql.DataFrame.sort">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">sort</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">]]],</span> |
| <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`DataFrame` sorted by the specified column(s).</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : int, str, list, or :class:`Column`, optional</span> |
| <span class="sd"> list of :class:`Column` or column names or column ordinals to sort by.</span> |
| |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| <span class="sd"> Supports column ordinal.</span> |
| |
| <span class="sd"> Other Parameters</span> |
| <span class="sd"> ----------------</span> |
| <span class="sd"> ascending : bool or list, optional, default True</span> |
| <span class="sd"> boolean or list of boolean.</span> |
| <span class="sd"> Sort ascending vs. descending. Specify list for multiple sort orders.</span> |
| <span class="sd"> If a list is specified, the length of the list must equal the length of the `cols`.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Sorted DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> A column ordinal starts from 1, which is different from the</span> |
| <span class="sd"> 0-based :meth:`__getitem__`.</span> |
| <span class="sd"> If a column ordinal is negative, it means sort descending.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| |
| <span class="sd"> Sort the DataFrame in ascending order.</span> |
| |
| <span class="sd"> >>> df.sort(sf.asc("age")).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> >>> df.sort(1).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> Sort the DataFrame in descending order.</span> |
| |
| <span class="sd"> >>> df.sort(df.age.desc()).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> >>> df.orderBy(df.age.desc()).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> >>> df.sort("age", ascending=False).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> >>> df.sort(-1).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> Specify multiple columns</span> |
| |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (2, "Alice"), (2, "Bob"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df.orderBy(sf.desc("age"), "name").show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 2| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> >>> df.orderBy(-1, "name").show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 2| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> >>> df.orderBy(-1, 2).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 2| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> Specify multiple columns for sorting order at `ascending`.</span> |
| |
| <span class="sd"> >>> df.orderBy(["age", "name"], ascending=[False, False]).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 2| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> >>> df.orderBy([1, "name"], ascending=[False, False]).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 2| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> >>> df.orderBy([1, 2], ascending=[False, False]).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> | 2| Bob|</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="n">orderBy</span> <span class="o">=</span> <span class="n">sort</span> |
| |
| <div class="viewcode-block" id="DataFrame.describe"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.describe.html#pyspark.sql.DataFrame.describe">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">describe</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Computes basic statistics for numeric and string columns.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.1</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> This includes count, mean, stddev, min, and max. If no columns are</span> |
| <span class="sd"> given, this function computes statistics for all numerical or string columns.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This function is meant for exploratory data analysis, as we make no</span> |
| <span class="sd"> guarantee about the backward compatibility of the schema of the resulting</span> |
| <span class="sd"> :class:`DataFrame`.</span> |
| |
| <span class="sd"> Use summary for expanded statistics and control over which statistics to compute.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : str, list, optional</span> |
| <span class="sd"> Column name or list of column names to describe by (default All columns).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> A new DataFrame that describes (provides statistics) given DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [("Bob", 13, 40.3, 150.5), ("Alice", 12, 37.8, 142.3), ("Tom", 11, 44.1, 142.2)],</span> |
| <span class="sd"> ... ["name", "age", "weight", "height"],</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.describe(['age']).show()</span> |
| <span class="sd"> +-------+----+</span> |
| <span class="sd"> |summary| age|</span> |
| <span class="sd"> +-------+----+</span> |
| <span class="sd"> | count| 3|</span> |
| <span class="sd"> | mean|12.0|</span> |
| <span class="sd"> | stddev| 1.0|</span> |
| <span class="sd"> | min| 11|</span> |
| <span class="sd"> | max| 13|</span> |
| <span class="sd"> +-------+----+</span> |
| |
| <span class="sd"> >>> df.describe(['age', 'weight', 'height']).show()</span> |
| <span class="sd"> +-------+----+------------------+-----------------+</span> |
| <span class="sd"> |summary| age| weight| height|</span> |
| <span class="sd"> +-------+----+------------------+-----------------+</span> |
| <span class="sd"> | count| 3| 3| 3|</span> |
| <span class="sd"> | mean|12.0| 40.73333333333333| 145.0|</span> |
| <span class="sd"> | stddev| 1.0|3.1722757341273704|4.763402145525822|</span> |
| <span class="sd"> | min| 11| 37.8| 142.2|</span> |
| <span class="sd"> | max| 13| 44.1| 150.5|</span> |
| <span class="sd"> +-------+----+------------------+-----------------+</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.summary</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.summary"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.summary.html#pyspark.sql.DataFrame.summary">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">summary</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">statistics</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Computes specified statistics for numeric and string columns. Available statistics are:</span> |
| <span class="sd"> - count</span> |
| <span class="sd"> - mean</span> |
| <span class="sd"> - stddev</span> |
| <span class="sd"> - min</span> |
| <span class="sd"> - max</span> |
| <span class="sd"> - arbitrary approximate percentiles specified as a percentage (e.g., 75%)</span> |
| |
| <span class="sd"> If no statistics are given, this function computes count, mean, stddev, min,</span> |
| <span class="sd"> approximate quartiles (percentiles at 25%, 50%, and 75%), and max.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> statistics : str, optional</span> |
| <span class="sd"> Column names to calculate statistics by (default All columns).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> A new DataFrame that provides statistics for the given DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This function is meant for exploratory data analysis, as we make no</span> |
| <span class="sd"> guarantee about the backward compatibility of the schema of the resulting</span> |
| <span class="sd"> :class:`DataFrame`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [("Bob", 13, 40.3, 150.5), ("Alice", 12, 37.8, 142.3), ("Tom", 11, 44.1, 142.2)],</span> |
| <span class="sd"> ... ["name", "age", "weight", "height"],</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.select("age", "weight", "height").summary().show()</span> |
| <span class="sd"> +-------+----+------------------+-----------------+</span> |
| <span class="sd"> |summary| age| weight| height|</span> |
| <span class="sd"> +-------+----+------------------+-----------------+</span> |
| <span class="sd"> | count| 3| 3| 3|</span> |
| <span class="sd"> | mean|12.0| 40.73333333333333| 145.0|</span> |
| <span class="sd"> | stddev| 1.0|3.1722757341273704|4.763402145525822|</span> |
| <span class="sd"> | min| 11| 37.8| 142.2|</span> |
| <span class="sd"> | 25%| 11| 37.8| 142.2|</span> |
| <span class="sd"> | 50%| 12| 40.3| 142.3|</span> |
| <span class="sd"> | 75%| 13| 44.1| 150.5|</span> |
| <span class="sd"> | max| 13| 44.1| 150.5|</span> |
| <span class="sd"> +-------+----+------------------+-----------------+</span> |
| |
| <span class="sd"> >>> df.select("age", "weight", "height").summary("count", "min", "25%", "75%", "max").show()</span> |
| <span class="sd"> +-------+---+------+------+</span> |
| <span class="sd"> |summary|age|weight|height|</span> |
| <span class="sd"> +-------+---+------+------+</span> |
| <span class="sd"> | count| 3| 3| 3|</span> |
| <span class="sd"> | min| 11| 37.8| 142.2|</span> |
| <span class="sd"> | 25%| 11| 37.8| 142.2|</span> |
| <span class="sd"> | 75%| 13| 44.1| 150.5|</span> |
| <span class="sd"> | max| 13| 44.1| 150.5|</span> |
| <span class="sd"> +-------+---+------+------+</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.display</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="n">Row</span><span class="p">]:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">Row</span><span class="p">]:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.head"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.head.html#pyspark.sql.DataFrame.head">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">head</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">n</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Optional</span><span class="p">[</span><span class="n">Row</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="n">Row</span><span class="p">]]:</span> |
| <span class="w"> </span><span class="sd">"""Returns the first ``n`` rows.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This method should only be used if the resulting array is expected</span> |
| <span class="sd"> to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> n : int, optional</span> |
| <span class="sd"> default 1. Number of rows to return.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> If n is supplied, return a list of :class:`Row` of length n</span> |
| <span class="sd"> or less if the DataFrame has fewer elements.</span> |
| <span class="sd"> If n is missing, return a single Row.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df.head()</span> |
| <span class="sd"> Row(age=2, name='Alice')</span> |
| <span class="sd"> >>> df.head(1)</span> |
| <span class="sd"> [Row(age=2, name='Alice')]</span> |
| <span class="sd"> >>> df.head(0)</span> |
| <span class="sd"> []</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.first"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.first.html#pyspark.sql.DataFrame.first">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">first</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">Optional</span><span class="p">[</span><span class="n">Row</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""Returns the first row as a :class:`Row`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`Row`</span> |
| <span class="sd"> First row if :class:`DataFrame` is not empty, otherwise ``None``.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df.first()</span> |
| <span class="sd"> Row(age=2, name='Alice')</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.__getitem__"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.__getitem__.html#pyspark.sql.DataFrame.__getitem__">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="fm">__getitem__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">item</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">int</span><span class="p">,</span> <span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">,</span> <span class="n">List</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">])</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="s2">"DataFrame"</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""Returns the column as a :class:`Column`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> item : int, str, :class:`Column`, list or tuple</span> |
| <span class="sd"> column index, column name, column, or a list or tuple of columns</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`Column` or :class:`DataFrame`</span> |
| <span class="sd"> a specified column, or a filtered or projected dataframe.</span> |
| |
| <span class="sd"> * If the input `item` is an int or str, the output is a :class:`Column`.</span> |
| |
| <span class="sd"> * If the input `item` is a :class:`Column`, the output is a :class:`DataFrame`</span> |
| <span class="sd"> filtered by this given :class:`Column`.</span> |
| |
| <span class="sd"> * If the input `item` is a list or tuple, the output is a :class:`DataFrame`</span> |
| <span class="sd"> projected by this given list or tuple.</span> |
| |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| |
| <span class="sd"> Retrieve a column instance.</span> |
| |
| <span class="sd"> >>> df.select(df['age']).show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> |age|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 5|</span> |
| <span class="sd"> +---+</span> |
| |
| <span class="sd"> >>> df.select(df[1]).show()</span> |
| <span class="sd"> +-----+</span> |
| <span class="sd"> | name|</span> |
| <span class="sd"> +-----+</span> |
| <span class="sd"> |Alice|</span> |
| <span class="sd"> | Bob|</span> |
| <span class="sd"> +-----+</span> |
| |
| <span class="sd"> Select multiple string columns as index.</span> |
| |
| <span class="sd"> >>> df[["name", "age"]].show()</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | name|age|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |Alice| 2|</span> |
| <span class="sd"> | Bob| 5|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> >>> df[df.age > 3].show()</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> |age|name|</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> >>> df[df[0] > 3].show()</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> |age|name|</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.__getattr__"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.__getattr__.html#pyspark.sql.DataFrame.__getattr__">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="fm">__getattr__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">name</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">Column</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns the :class:`Column` denoted by ``name``.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> name : str</span> |
| <span class="sd"> Column name to return as :class:`Column`.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`Column`</span> |
| <span class="sd"> Requested column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| |
| <span class="sd"> Retrieve a column instance.</span> |
| |
| <span class="sd"> >>> df.select(df.age).show()</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> |age|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> | 5|</span> |
| <span class="sd"> +---+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="fm">__dir__</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import lit</span> |
| |
| <span class="sd"> Create a dataframe with a column named 'id'.</span> |
| |
| <span class="sd"> >>> df = spark.range(3)</span> |
| <span class="sd"> >>> [attr for attr in dir(df) if attr[0] == 'i'][:7] # Includes column id</span> |
| <span class="sd"> ['id', 'inputFiles', 'intersect', 'intersectAll', 'isEmpty', 'isLocal', 'isStreaming']</span> |
| |
| <span class="sd"> Add a column named 'i_like_pancakes'.</span> |
| |
| <span class="sd"> >>> df = df.withColumn('i_like_pancakes', lit(1))</span> |
| <span class="sd"> >>> [attr for attr in dir(df) if attr[0] == 'i'][:7] # Includes columns i_like_pancakes, id</span> |
| <span class="sd"> ['i_like_pancakes', 'id', 'inputFiles', 'intersect', 'intersectAll', 'isEmpty', 'isLocal']</span> |
| |
| <span class="sd"> Try to add an existed column 'inputFiles'.</span> |
| |
| <span class="sd"> >>> df = df.withColumn('inputFiles', lit(2))</span> |
| <span class="sd"> >>> [attr for attr in dir(df) if attr[0] == 'i'][:7] # Doesn't duplicate inputFiles</span> |
| <span class="sd"> ['i_like_pancakes', 'id', 'inputFiles', 'intersect', 'intersectAll', 'isEmpty', 'isLocal']</span> |
| |
| <span class="sd"> Try to add a column named 'id2'.</span> |
| |
| <span class="sd"> >>> df = df.withColumn('id2', lit(3))</span> |
| <span class="sd"> >>> [attr for attr in dir(df) if attr[0] == 'i'][:7] # result includes id2 and sorted</span> |
| <span class="sd"> ['i_like_pancakes', 'id', 'id2', 'inputFiles', 'intersect', 'intersectAll', 'isEmpty']</span> |
| |
| <span class="sd"> Don't include columns that are not valid python identifiers.</span> |
| |
| <span class="sd"> >>> df = df.withColumn('1', lit(4))</span> |
| <span class="sd"> >>> df = df.withColumn('name 1', lit(5))</span> |
| <span class="sd"> >>> [attr for attr in dir(df) if attr[0] == 'i'][:7] # Doesn't include 1 or name 1</span> |
| <span class="sd"> ['i_like_pancakes', 'id', 'id2', 'inputFiles', 'intersect', 'intersectAll', 'isEmpty']</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">select</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">select</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.select"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.select.html#pyspark.sql.DataFrame.select">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="k">def</span> <span class="nf">select</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Projects a set of expressions and returns a new :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : str, :class:`Column`, or list</span> |
| <span class="sd"> column names (string) or expressions (:class:`Column`).</span> |
| <span class="sd"> If one of the column names is '*', that column is expanded to include all columns</span> |
| <span class="sd"> in the current :class:`DataFrame`.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> A DataFrame with subset (or all) of columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| |
| <span class="sd"> Select all columns in the DataFrame.</span> |
| |
| <span class="sd"> >>> df.select('*').show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> Select a column with other expressions in the DataFrame.</span> |
| |
| <span class="sd"> >>> df.select(df.name, (df.age + 10).alias('age')).show()</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | name|age|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |Alice| 12|</span> |
| <span class="sd"> | Bob| 15|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">selectExpr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">expr</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">selectExpr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">expr</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.selectExpr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.selectExpr.html#pyspark.sql.DataFrame.selectExpr">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">selectExpr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">expr</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Projects a set of SQL expressions and returns a new :class:`DataFrame`.</span> |
| |
| <span class="sd"> This is a variant of :func:`select` that accepts SQL expressions.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> A DataFrame with new/old columns transformed by expressions.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df.selectExpr("age * 2", "abs(age)").show()</span> |
| <span class="sd"> +---------+--------+</span> |
| <span class="sd"> |(age * 2)|abs(age)|</span> |
| <span class="sd"> +---------+--------+</span> |
| <span class="sd"> | 4| 2|</span> |
| <span class="sd"> | 10| 5|</span> |
| <span class="sd"> +---------+--------+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.filter"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.filter.html#pyspark.sql.DataFrame.filter">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">filter</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">condition</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Filters rows using the given condition.</span> |
| |
| <span class="sd"> :func:`where` is an alias for :func:`filter`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> condition : :class:`Column` or str</span> |
| <span class="sd"> A :class:`Column` of :class:`types.BooleanType`</span> |
| <span class="sd"> or a string of SQL expressions.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> A new DataFrame with rows that satisfy the condition.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (2, "Alice", "Math"), (5, "Bob", "Physics"), (7, "Charlie", "Chemistry")],</span> |
| <span class="sd"> ... schema=["age", "name", "subject"])</span> |
| |
| <span class="sd"> Filter by :class:`Column` instances.</span> |
| |
| <span class="sd"> >>> df.filter(df.age > 3).show()</span> |
| <span class="sd"> +---+-------+---------+</span> |
| <span class="sd"> |age| name| subject|</span> |
| <span class="sd"> +---+-------+---------+</span> |
| <span class="sd"> | 5| Bob| Physics|</span> |
| <span class="sd"> | 7|Charlie|Chemistry|</span> |
| <span class="sd"> +---+-------+---------+</span> |
| <span class="sd"> >>> df.where(df.age == 2).show()</span> |
| <span class="sd"> +---+-----+-------+</span> |
| <span class="sd"> |age| name|subject|</span> |
| <span class="sd"> +---+-----+-------+</span> |
| <span class="sd"> | 2|Alice| Math|</span> |
| <span class="sd"> +---+-----+-------+</span> |
| |
| <span class="sd"> Filter by SQL expression in a string.</span> |
| |
| <span class="sd"> >>> df.filter("age > 3").show()</span> |
| <span class="sd"> +---+-------+---------+</span> |
| <span class="sd"> |age| name| subject|</span> |
| <span class="sd"> +---+-------+---------+</span> |
| <span class="sd"> | 5| Bob| Physics|</span> |
| <span class="sd"> | 7|Charlie|Chemistry|</span> |
| <span class="sd"> +---+-------+---------+</span> |
| <span class="sd"> >>> df.where("age = 2").show()</span> |
| <span class="sd"> +---+-----+-------+</span> |
| <span class="sd"> |age| name|subject|</span> |
| <span class="sd"> +---+-----+-------+</span> |
| <span class="sd"> | 2|Alice| Math|</span> |
| <span class="sd"> +---+-----+-------+</span> |
| |
| <span class="sd"> Filter by multiple conditions.</span> |
| |
| <span class="sd"> >>> df.filter((df.age > 3) & (df.subject == "Physics")).show()</span> |
| <span class="sd"> +---+----+-------+</span> |
| <span class="sd"> |age|name|subject|</span> |
| <span class="sd"> +---+----+-------+</span> |
| <span class="sd"> | 5| Bob|Physics|</span> |
| <span class="sd"> +---+----+-------+</span> |
| <span class="sd"> >>> df.filter((df.age == 2) | (df.subject == "Chemistry")).show()</span> |
| <span class="sd"> +---+-------+---------+</span> |
| <span class="sd"> |age| name| subject|</span> |
| <span class="sd"> +---+-------+---------+</span> |
| <span class="sd"> | 2| Alice| Math|</span> |
| <span class="sd"> | 7|Charlie|Chemistry|</span> |
| <span class="sd"> +---+-------+---------+</span> |
| |
| <span class="sd"> Filter by multiple conditions using SQL expression.</span> |
| |
| <span class="sd"> >>> df.filter("age > 3 AND name = 'Bob'").show()</span> |
| <span class="sd"> +---+----+-------+</span> |
| <span class="sd"> |age|name|subject|</span> |
| <span class="sd"> +---+----+-------+</span> |
| <span class="sd"> | 5| Bob|Physics|</span> |
| <span class="sd"> +---+----+-------+</span> |
| |
| <span class="sd"> Filter using the :func:`Column.isin` function.</span> |
| |
| <span class="sd"> >>> df.filter(df.name.isin("Alice", "Bob")).show()</span> |
| <span class="sd"> +---+-----+-------+</span> |
| <span class="sd"> |age| name|subject|</span> |
| <span class="sd"> +---+-----+-------+</span> |
| <span class="sd"> | 2|Alice| Math|</span> |
| <span class="sd"> | 5| Bob|Physics|</span> |
| <span class="sd"> +---+-----+-------+</span> |
| |
| <span class="sd"> Filter by a list of values using the :func:`Column.isin` function.</span> |
| |
| <span class="sd"> >>> df.filter(df.subject.isin(["Math", "Physics"])).show()</span> |
| <span class="sd"> +---+-----+-------+</span> |
| <span class="sd"> |age| name|subject|</span> |
| <span class="sd"> +---+-----+-------+</span> |
| <span class="sd"> | 2|Alice| Math|</span> |
| <span class="sd"> | 5| Bob|Physics|</span> |
| <span class="sd"> +---+-----+-------+</span> |
| |
| <span class="sd"> Filter using the `~` operator to exclude certain values.</span> |
| |
| <span class="sd"> >>> df.filter(~df.name.isin(["Alice", "Charlie"])).show()</span> |
| <span class="sd"> +---+----+-------+</span> |
| <span class="sd"> |age|name|subject|</span> |
| <span class="sd"> +---+----+-------+</span> |
| <span class="sd"> | 5| Bob|Physics|</span> |
| <span class="sd"> +---+----+-------+</span> |
| |
| <span class="sd"> Filter using the :func:`Column.isNotNull` function.</span> |
| |
| <span class="sd"> >>> df.filter(df.name.isNotNull()).show()</span> |
| <span class="sd"> +---+-------+---------+</span> |
| <span class="sd"> |age| name| subject|</span> |
| <span class="sd"> +---+-------+---------+</span> |
| <span class="sd"> | 2| Alice| Math|</span> |
| <span class="sd"> | 5| Bob| Physics|</span> |
| <span class="sd"> | 7|Charlie|Chemistry|</span> |
| <span class="sd"> +---+-------+---------+</span> |
| |
| <span class="sd"> Filter using the :func:`Column.like` function.</span> |
| |
| <span class="sd"> >>> df.filter(df.name.like("Al%")).show()</span> |
| <span class="sd"> +---+-----+-------+</span> |
| <span class="sd"> |age| name|subject|</span> |
| <span class="sd"> +---+-----+-------+</span> |
| <span class="sd"> | 2|Alice| Math|</span> |
| <span class="sd"> +---+-----+-------+</span> |
| |
| <span class="sd"> Filter using the :func:`Column.contains` function.</span> |
| |
| <span class="sd"> >>> df.filter(df.name.contains("i")).show()</span> |
| <span class="sd"> +---+-------+---------+</span> |
| <span class="sd"> |age| name| subject|</span> |
| <span class="sd"> +---+-------+---------+</span> |
| <span class="sd"> | 2| Alice| Math|</span> |
| <span class="sd"> | 7|Charlie|Chemistry|</span> |
| <span class="sd"> +---+-------+---------+</span> |
| |
| <span class="sd"> Filter using the :func:`Column.between` function.</span> |
| |
| <span class="sd"> >>> df.filter(df.age.between(2, 5)).show()</span> |
| <span class="sd"> +---+-----+-------+</span> |
| <span class="sd"> |age| name|subject|</span> |
| <span class="sd"> +---+-----+-------+</span> |
| <span class="sd"> | 2|Alice| Math|</span> |
| <span class="sd"> | 5| Bob|Physics|</span> |
| <span class="sd"> +---+-----+-------+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">groupBy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrNameOrOrdinal"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GroupedData"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">groupBy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]])</span> <span class="o">-></span> <span class="s2">"GroupedData"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.groupBy"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.groupBy.html#pyspark.sql.DataFrame.groupBy">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="k">def</span> <span class="nf">groupBy</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrNameOrOrdinal"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GroupedData"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Groups the :class:`DataFrame` by the specified columns so that aggregation</span> |
| <span class="sd"> can be performed on them.</span> |
| <span class="sd"> See :class:`GroupedData` for all the available aggregate functions.</span> |
| |
| <span class="sd"> :func:`groupby` is an alias for :func:`groupBy`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : list, str, int or :class:`Column`</span> |
| <span class="sd"> The columns to group by.</span> |
| <span class="sd"> Each element can be a column name (string) or an expression (:class:`Column`)</span> |
| <span class="sd"> or a column ordinal (int, 1-based) or list of them.</span> |
| |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| <span class="sd"> Supports column ordinal.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`GroupedData`</span> |
| <span class="sd"> A :class:`GroupedData` object representing the grouped data by the specified columns.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> A column ordinal starts from 1, which is different from the</span> |
| <span class="sd"> 0-based :meth:`__getitem__`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... ("Alice", 2), ("Bob", 2), ("Bob", 2), ("Bob", 5)], schema=["name", "age"])</span> |
| |
| <span class="sd"> Example 1: Empty grouping columns triggers a global aggregation.</span> |
| |
| <span class="sd"> >>> df.groupBy().avg().show()</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> |avg(age)|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> | 2.75|</span> |
| <span class="sd"> +--------+</span> |
| |
| <span class="sd"> Example 2: Group-by 'name', and specify a dictionary to calculate the summation of 'age'.</span> |
| |
| <span class="sd"> >>> df.groupBy("name").agg({"age": "sum"}).sort("name").show()</span> |
| <span class="sd"> +-----+--------+</span> |
| <span class="sd"> | name|sum(age)|</span> |
| <span class="sd"> +-----+--------+</span> |
| <span class="sd"> |Alice| 2|</span> |
| <span class="sd"> | Bob| 9|</span> |
| <span class="sd"> +-----+--------+</span> |
| |
| <span class="sd"> Example 3: Group-by 'name', and calculate maximum values.</span> |
| |
| <span class="sd"> >>> df.groupBy(df.name).max().sort("name").show()</span> |
| <span class="sd"> +-----+--------+</span> |
| <span class="sd"> | name|max(age)|</span> |
| <span class="sd"> +-----+--------+</span> |
| <span class="sd"> |Alice| 2|</span> |
| <span class="sd"> | Bob| 5|</span> |
| <span class="sd"> +-----+--------+</span> |
| |
| <span class="sd"> Example 4: Also group-by 'name', but using the column ordinal.</span> |
| |
| <span class="sd"> >>> df.groupBy(1).max().sort("name").show()</span> |
| <span class="sd"> +-----+--------+</span> |
| <span class="sd"> | name|max(age)|</span> |
| <span class="sd"> +-----+--------+</span> |
| <span class="sd"> |Alice| 2|</span> |
| <span class="sd"> | Bob| 5|</span> |
| <span class="sd"> +-----+--------+</span> |
| |
| <span class="sd"> Example 5: Group-by 'name' and 'age', and calculate the number of rows in each group.</span> |
| |
| <span class="sd"> >>> df.groupBy(["name", df.age]).count().sort("name", "age").show()</span> |
| <span class="sd"> +-----+---+-----+</span> |
| <span class="sd"> | name|age|count|</span> |
| <span class="sd"> +-----+---+-----+</span> |
| <span class="sd"> |Alice| 2| 1|</span> |
| <span class="sd"> | Bob| 2| 2|</span> |
| <span class="sd"> | Bob| 5| 1|</span> |
| <span class="sd"> +-----+---+-----+</span> |
| |
| <span class="sd"> Example 6: Also Group-by 'name' and 'age', but using the column ordinal.</span> |
| |
| <span class="sd"> >>> df.groupBy([df.name, 2]).count().sort("name", "age").show()</span> |
| <span class="sd"> +-----+---+-----+</span> |
| <span class="sd"> | name|age|count|</span> |
| <span class="sd"> +-----+---+-----+</span> |
| <span class="sd"> |Alice| 2| 1|</span> |
| <span class="sd"> | Bob| 2| 2|</span> |
| <span class="sd"> | Bob| 5| 1|</span> |
| <span class="sd"> +-----+---+-----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">rollup</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GroupedData"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">rollup</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]])</span> <span class="o">-></span> <span class="s2">"GroupedData"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.rollup"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.rollup.html#pyspark.sql.DataFrame.rollup">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="k">def</span> <span class="nf">rollup</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrNameOrOrdinal"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GroupedData"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Create a multi-dimensional rollup for the current :class:`DataFrame` using</span> |
| <span class="sd"> the specified columns, allowing for aggregation on them.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : list, str, int or :class:`Column`</span> |
| <span class="sd"> The columns to roll-up by.</span> |
| <span class="sd"> Each element should be a column name (string) or an expression (:class:`Column`)</span> |
| <span class="sd"> or a column ordinal (int, 1-based) or list of them.</span> |
| |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| <span class="sd"> Supports column ordinal.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`GroupedData`</span> |
| <span class="sd"> Rolled-up data based on the specified columns.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> A column ordinal starts from 1, which is different from the</span> |
| <span class="sd"> 0-based :meth:`__getitem__`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], schema=["name", "age"])</span> |
| |
| <span class="sd"> Example 1: Rollup-by 'name', and calculate the number of rows in each dimensional.</span> |
| |
| <span class="sd"> >>> df.rollup("name").count().orderBy("name").show()</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> | name|count|</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> | NULL| 2|</span> |
| <span class="sd"> |Alice| 1|</span> |
| <span class="sd"> | Bob| 1|</span> |
| <span class="sd"> +-----+-----+</span> |
| |
| <span class="sd"> Example 2: Rollup-by 'name' and 'age',</span> |
| <span class="sd"> and calculate the number of rows in each dimensional.</span> |
| |
| <span class="sd"> >>> df.rollup("name", df.age).count().orderBy("name", "age").show()</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> | name| age|count|</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> | NULL|NULL| 2|</span> |
| <span class="sd"> |Alice|NULL| 1|</span> |
| <span class="sd"> |Alice| 2| 1|</span> |
| <span class="sd"> | Bob|NULL| 1|</span> |
| <span class="sd"> | Bob| 5| 1|</span> |
| <span class="sd"> +-----+----+-----+</span> |
| |
| <span class="sd"> Example 3: Also Rollup-by 'name' and 'age', but using the column ordinal.</span> |
| |
| <span class="sd"> >>> df.rollup(1, 2).count().orderBy(1, 2).show()</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> | name| age|count|</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> | NULL|NULL| 2|</span> |
| <span class="sd"> |Alice|NULL| 1|</span> |
| <span class="sd"> |Alice| 2| 1|</span> |
| <span class="sd"> | Bob|NULL| 1|</span> |
| <span class="sd"> | Bob| 5| 1|</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">cube</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GroupedData"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">cube</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]])</span> <span class="o">-></span> <span class="s2">"GroupedData"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.cube"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.cube.html#pyspark.sql.DataFrame.cube">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="k">def</span> <span class="nf">cube</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GroupedData"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Create a multi-dimensional cube for the current :class:`DataFrame` using</span> |
| <span class="sd"> the specified columns, allowing aggregations to be performed on them.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : list, str, int or :class:`Column`</span> |
| <span class="sd"> The columns to cube by.</span> |
| <span class="sd"> Each element should be a column name (string) or an expression (:class:`Column`)</span> |
| <span class="sd"> or a column ordinal (int, 1-based) or list of them.</span> |
| |
| <span class="sd"> .. versionchanged:: 4.0.0</span> |
| <span class="sd"> Supports column ordinal.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`GroupedData`</span> |
| <span class="sd"> Cube of the data based on the specified columns.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> A column ordinal starts from 1, which is different from the</span> |
| <span class="sd"> 0-based :meth:`__getitem__`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([("Alice", 2), ("Bob", 5)], schema=["name", "age"])</span> |
| |
| <span class="sd"> Example 1: Creating a cube on 'name',</span> |
| <span class="sd"> and calculate the number of rows in each dimensional.</span> |
| |
| <span class="sd"> >>> df.cube("name").count().orderBy("name").show()</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> | name|count|</span> |
| <span class="sd"> +-----+-----+</span> |
| <span class="sd"> | NULL| 2|</span> |
| <span class="sd"> |Alice| 1|</span> |
| <span class="sd"> | Bob| 1|</span> |
| <span class="sd"> +-----+-----+</span> |
| |
| <span class="sd"> Example 2: Creating a cube on 'name' and 'age',</span> |
| <span class="sd"> and calculate the number of rows in each dimensional.</span> |
| |
| <span class="sd"> >>> df.cube("name", df.age).count().orderBy("name", "age").show()</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> | name| age|count|</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> | NULL|NULL| 2|</span> |
| <span class="sd"> | NULL| 2| 1|</span> |
| <span class="sd"> | NULL| 5| 1|</span> |
| <span class="sd"> |Alice|NULL| 1|</span> |
| <span class="sd"> |Alice| 2| 1|</span> |
| <span class="sd"> | Bob|NULL| 1|</span> |
| <span class="sd"> | Bob| 5| 1|</span> |
| <span class="sd"> +-----+----+-----+</span> |
| |
| <span class="sd"> Example 3: Also creating a cube on 'name' and 'age', but using the column ordinal.</span> |
| |
| <span class="sd"> >>> df.cube(1, 2).count().orderBy(1, 2).show()</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> | name| age|count|</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> | NULL|NULL| 2|</span> |
| <span class="sd"> | NULL| 2| 1|</span> |
| <span class="sd"> | NULL| 5| 1|</span> |
| <span class="sd"> |Alice|NULL| 1|</span> |
| <span class="sd"> |Alice| 2| 1|</span> |
| <span class="sd"> | Bob|NULL| 1|</span> |
| <span class="sd"> | Bob| 5| 1|</span> |
| <span class="sd"> +-----+----+-----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.groupingSets"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.groupingSets.html#pyspark.sql.DataFrame.groupingSets">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">groupingSets</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">groupingSets</span><span class="p">:</span> <span class="n">Sequence</span><span class="p">[</span><span class="n">Sequence</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">]],</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"GroupedData"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Create multi-dimensional aggregation for the current `class`:DataFrame using the specified</span> |
| <span class="sd"> grouping sets, so we can run aggregation on them.</span> |
| |
| <span class="sd"> .. versionadded:: 4.0.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> groupingSets : sequence of sequence of columns or str</span> |
| <span class="sd"> Individual set of columns to group on.</span> |
| <span class="sd"> cols : :class:`Column` or str</span> |
| <span class="sd"> Addional grouping columns specified by users.</span> |
| <span class="sd"> Those columns are shown as the output columns after aggregation.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`GroupedData`</span> |
| <span class="sd"> Grouping sets of the data based on the specified columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Group by city and car_model, city, and all, and calculate the sum of quantity.</span> |
| |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (100, 'Fremont', 'Honda Civic', 10),</span> |
| <span class="sd"> ... (100, 'Fremont', 'Honda Accord', 15),</span> |
| <span class="sd"> ... (100, 'Fremont', 'Honda CRV', 7),</span> |
| <span class="sd"> ... (200, 'Dublin', 'Honda Civic', 20),</span> |
| <span class="sd"> ... (200, 'Dublin', 'Honda Accord', 10),</span> |
| <span class="sd"> ... (200, 'Dublin', 'Honda CRV', 3),</span> |
| <span class="sd"> ... (300, 'San Jose', 'Honda Civic', 5),</span> |
| <span class="sd"> ... (300, 'San Jose', 'Honda Accord', 8)</span> |
| <span class="sd"> ... ], schema="id INT, city STRING, car_model STRING, quantity INT")</span> |
| |
| <span class="sd"> >>> df.groupingSets(</span> |
| <span class="sd"> ... [("city", "car_model"), ("city",), ()],</span> |
| <span class="sd"> ... "city", "car_model"</span> |
| <span class="sd"> ... ).agg(sf.sum(sf.col("quantity")).alias("sum")).sort("city", "car_model").show()</span> |
| <span class="sd"> +--------+------------+---+</span> |
| <span class="sd"> | city| car_model|sum|</span> |
| <span class="sd"> +--------+------------+---+</span> |
| <span class="sd"> | NULL| NULL| 78|</span> |
| <span class="sd"> | Dublin| NULL| 33|</span> |
| <span class="sd"> | Dublin|Honda Accord| 10|</span> |
| <span class="sd"> | Dublin| Honda CRV| 3|</span> |
| <span class="sd"> | Dublin| Honda Civic| 20|</span> |
| <span class="sd"> | Fremont| NULL| 32|</span> |
| <span class="sd"> | Fremont|Honda Accord| 15|</span> |
| <span class="sd"> | Fremont| Honda CRV| 7|</span> |
| <span class="sd"> | Fremont| Honda Civic| 10|</span> |
| <span class="sd"> |San Jose| NULL| 13|</span> |
| <span class="sd"> |San Jose|Honda Accord| 8|</span> |
| <span class="sd"> |San Jose| Honda Civic| 5|</span> |
| <span class="sd"> +--------+------------+---+</span> |
| |
| <span class="sd"> Example 2: Group by multiple columns and calculate both average and sum.</span> |
| |
| <span class="sd"> >>> df.groupingSets(</span> |
| <span class="sd"> ... [("city", "car_model"), ("city",), ()],</span> |
| <span class="sd"> ... "city", "car_model"</span> |
| <span class="sd"> ... ).agg(</span> |
| <span class="sd"> ... sf.avg(sf.col("quantity")).alias("avg_quantity"),</span> |
| <span class="sd"> ... sf.sum(sf.col("quantity")).alias("sum_quantity")</span> |
| <span class="sd"> ... ).sort("city", "car_model").show()</span> |
| <span class="sd"> +--------+------------+------------------+------------+</span> |
| <span class="sd"> | city| car_model| avg_quantity|sum_quantity|</span> |
| <span class="sd"> +--------+------------+------------------+------------+</span> |
| <span class="sd"> | NULL| NULL| 9.75| 78|</span> |
| <span class="sd"> | Dublin| NULL| 11.0| 33|</span> |
| <span class="sd"> | Dublin|Honda Accord| 10.0| 10|</span> |
| <span class="sd"> | Dublin| Honda CRV| 3.0| 3|</span> |
| <span class="sd"> | Dublin| Honda Civic| 20.0| 20|</span> |
| <span class="sd"> | Fremont| NULL|10.666666666666666| 32|</span> |
| <span class="sd"> | Fremont|Honda Accord| 15.0| 15|</span> |
| <span class="sd"> | Fremont| Honda CRV| 7.0| 7|</span> |
| <span class="sd"> | Fremont| Honda Civic| 10.0| 10|</span> |
| <span class="sd"> |San Jose| NULL| 6.5| 13|</span> |
| <span class="sd"> |San Jose|Honda Accord| 8.0| 8|</span> |
| <span class="sd"> |San Jose| Honda Civic| 5.0| 5|</span> |
| <span class="sd"> +--------+------------+------------------+------------+</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> GroupedData</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.unpivot"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.unpivot.html#pyspark.sql.DataFrame.unpivot">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">unpivot</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">ids</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="o">...</span><span class="p">]],</span> |
| <span class="n">values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="o">...</span><span class="p">]]],</span> |
| <span class="n">variableColumnName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">valueColumnName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Unpivot a DataFrame from wide format to long format, optionally leaving</span> |
| <span class="sd"> identifier columns set. This is the reverse to `groupBy(...).pivot(...).agg(...)`,</span> |
| <span class="sd"> except for the aggregation, which cannot be reversed.</span> |
| |
| <span class="sd"> This function is useful to massage a DataFrame into a format where some</span> |
| <span class="sd"> columns are identifier columns ("ids"), while all other columns ("values")</span> |
| <span class="sd"> are "unpivoted" to the rows, leaving just two non-id columns, named as given</span> |
| <span class="sd"> by `variableColumnName` and `valueColumnName`.</span> |
| |
| <span class="sd"> When no "id" columns are given, the unpivoted DataFrame consists of only the</span> |
| <span class="sd"> "variable" and "value" columns.</span> |
| |
| <span class="sd"> The `values` columns must not be empty so at least one value must be given to be unpivoted.</span> |
| <span class="sd"> When `values` is `None`, all non-id columns will be unpivoted.</span> |
| |
| <span class="sd"> All "value" columns must share a least common data type. Unless they are the same data type,</span> |
| <span class="sd"> all "value" columns are cast to the nearest common data type. For instance, types</span> |
| <span class="sd"> `IntegerType` and `LongType` are cast to `LongType`, while `IntegerType` and `StringType`</span> |
| <span class="sd"> do not have a common data type and `unpivot` fails.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> ids : str, Column, tuple, list</span> |
| <span class="sd"> Column(s) to use as identifiers. Can be a single column or column name,</span> |
| <span class="sd"> or a list or tuple for multiple columns.</span> |
| <span class="sd"> values : str, Column, tuple, list, optional</span> |
| <span class="sd"> Column(s) to unpivot. Can be a single column or column name, or a list or tuple</span> |
| <span class="sd"> for multiple columns. If specified, must not be empty. If not specified, uses all</span> |
| <span class="sd"> columns that are not set as `ids`.</span> |
| <span class="sd"> variableColumnName : str</span> |
| <span class="sd"> Name of the variable column.</span> |
| <span class="sd"> valueColumnName : str</span> |
| <span class="sd"> Name of the value column.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Unpivoted DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(1, 11, 1.1), (2, 12, 1.2)],</span> |
| <span class="sd"> ... ["id", "int", "double"],</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> df.show()</span> |
| <span class="sd"> +---+---+------+</span> |
| <span class="sd"> | id|int|double|</span> |
| <span class="sd"> +---+---+------+</span> |
| <span class="sd"> | 1| 11| 1.1|</span> |
| <span class="sd"> | 2| 12| 1.2|</span> |
| <span class="sd"> +---+---+------+</span> |
| |
| <span class="sd"> >>> df.unpivot("id", ["int", "double"], "var", "val").show()</span> |
| <span class="sd"> +---+------+----+</span> |
| <span class="sd"> | id| var| val|</span> |
| <span class="sd"> +---+------+----+</span> |
| <span class="sd"> | 1| int|11.0|</span> |
| <span class="sd"> | 1|double| 1.1|</span> |
| <span class="sd"> | 2| int|12.0|</span> |
| <span class="sd"> | 2|double| 1.2|</span> |
| <span class="sd"> +---+------+----+</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.melt</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.melt"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.melt.html#pyspark.sql.DataFrame.melt">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">melt</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">ids</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="o">...</span><span class="p">]],</span> |
| <span class="n">values</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="s2">"ColumnOrName"</span><span class="p">,</span> <span class="o">...</span><span class="p">]]],</span> |
| <span class="n">variableColumnName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">valueColumnName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Unpivot a DataFrame from wide format to long format, optionally leaving</span> |
| <span class="sd"> identifier columns set. This is the reverse to `groupBy(...).pivot(...).agg(...)`,</span> |
| <span class="sd"> except for the aggregation, which cannot be reversed.</span> |
| |
| <span class="sd"> :func:`melt` is an alias for :func:`unpivot`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> ids : str, Column, tuple, list, optional</span> |
| <span class="sd"> Column(s) to use as identifiers. Can be a single column or column name,</span> |
| <span class="sd"> or a list or tuple for multiple columns.</span> |
| <span class="sd"> values : str, Column, tuple, list, optional</span> |
| <span class="sd"> Column(s) to unpivot. Can be a single column or column name, or a list or tuple</span> |
| <span class="sd"> for multiple columns. If not specified or empty, use all columns that</span> |
| <span class="sd"> are not set as `ids`.</span> |
| <span class="sd"> variableColumnName : str</span> |
| <span class="sd"> Name of the variable column.</span> |
| <span class="sd"> valueColumnName : str</span> |
| <span class="sd"> Name of the value column.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Unpivoted DataFrame.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.unpivot</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.agg"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.agg.html#pyspark.sql.DataFrame.agg">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">agg</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">exprs</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">Column</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">]])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Aggregate on the entire :class:`DataFrame` without groups</span> |
| <span class="sd"> (shorthand for ``df.groupBy().agg()``).</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> exprs : :class:`Column` or dict of key and value strings</span> |
| <span class="sd"> Columns or expressions to aggregate DataFrame by.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Aggregated DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df.agg({"age": "max"}).show()</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> |max(age)|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> | 5|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> >>> df.agg(sf.min(df.age)).show()</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> |min(age)|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> | 2|</span> |
| <span class="sd"> +--------+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.observe"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.observe.html#pyspark.sql.DataFrame.observe">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">observe</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">observation</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"Observation"</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="o">*</span><span class="n">exprs</span><span class="p">:</span> <span class="n">Column</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Define (named) metrics to observe on the DataFrame. This method returns an 'observed'</span> |
| <span class="sd"> DataFrame that returns the same result as the input, with the following guarantees:</span> |
| |
| <span class="sd"> * It will compute the defined aggregates (metrics) on all the data that is flowing through</span> |
| <span class="sd"> the Dataset at that point.</span> |
| |
| <span class="sd"> * It will report the value of the defined aggregate columns as soon as we reach a completion</span> |
| <span class="sd"> point. A completion point is either the end of a query (batch mode) or the end of a</span> |
| <span class="sd"> streaming epoch. The value of the aggregates only reflects the data processed since</span> |
| <span class="sd"> the previous completion point.</span> |
| |
| <span class="sd"> The metrics columns must either contain a literal (e.g. lit(42)), or should contain one or</span> |
| <span class="sd"> more aggregate functions (e.g. sum(a) or sum(a + b) + avg(c) - lit(1)). Expressions that</span> |
| <span class="sd"> contain references to the input Dataset's columns must always be wrapped in an aggregate</span> |
| <span class="sd"> function.</span> |
| |
| <span class="sd"> A user can observe these metrics by adding</span> |
| <span class="sd"> Python's :class:`~pyspark.sql.streaming.StreamingQueryListener`,</span> |
| <span class="sd"> Scala/Java's ``org.apache.spark.sql.streaming.StreamingQueryListener`` or Scala/Java's</span> |
| <span class="sd"> ``org.apache.spark.sql.util.QueryExecutionListener`` to the spark session.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.5.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> observation : :class:`Observation` or str</span> |
| <span class="sd"> `str` to specify the name, or an :class:`Observation` instance to obtain the metric.</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Added support for `str` in this parameter.</span> |
| <span class="sd"> exprs : :class:`Column`</span> |
| <span class="sd"> column expressions (:class:`Column`).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> the observed :class:`DataFrame`.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> When ``observation`` is :class:`Observation`, this method only supports batch queries.</span> |
| <span class="sd"> When ``observation`` is a string, this method works for both batch and streaming queries.</span> |
| <span class="sd"> Continuous execution is currently not supported yet.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> When ``observation`` is :class:`Observation`, only batch queries work as below.</span> |
| |
| <span class="sd"> >>> from pyspark.sql.functions import col, count, lit, max</span> |
| <span class="sd"> >>> from pyspark.sql import Observation</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> observation = Observation("my metrics")</span> |
| <span class="sd"> >>> observed_df = df.observe(observation, count(lit(1)).alias("count"), max(col("age")))</span> |
| <span class="sd"> >>> observed_df.count()</span> |
| <span class="sd"> 2</span> |
| <span class="sd"> >>> observation.get</span> |
| <span class="sd"> {'count': 2, 'max(age)': 5}</span> |
| |
| <span class="sd"> When ``observation`` is a string, streaming queries also work as below.</span> |
| |
| <span class="sd"> >>> from pyspark.sql.streaming import StreamingQueryListener</span> |
| <span class="sd"> >>> import time</span> |
| <span class="sd"> >>> class MyErrorListener(StreamingQueryListener):</span> |
| <span class="sd"> ... def onQueryStarted(self, event):</span> |
| <span class="sd"> ... pass</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ... def onQueryProgress(self, event):</span> |
| <span class="sd"> ... row = event.progress.observedMetrics.get("my_event")</span> |
| <span class="sd"> ... # Trigger if the number of errors exceeds 5 percent</span> |
| <span class="sd"> ... num_rows = row.rc</span> |
| <span class="sd"> ... num_error_rows = row.erc</span> |
| <span class="sd"> ... ratio = num_error_rows / num_rows</span> |
| <span class="sd"> ... if ratio > 0.05:</span> |
| <span class="sd"> ... # Trigger alert</span> |
| <span class="sd"> ... pass</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ... def onQueryIdle(self, event):</span> |
| <span class="sd"> ... pass</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ... def onQueryTerminated(self, event):</span> |
| <span class="sd"> ... pass</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> error_listener = MyErrorListener()</span> |
| <span class="sd"> >>> spark.streams.addListener(error_listener)</span> |
| <span class="sd"> >>> sdf = spark.readStream.format("rate").load().withColumn(</span> |
| <span class="sd"> ... "error", col("value")</span> |
| <span class="sd"> ... )</span> |
| <span class="sd"> >>> # Observe row count (rc) and error row count (erc) in the streaming Dataset</span> |
| <span class="sd"> ... observed_ds = sdf.observe(</span> |
| <span class="sd"> ... "my_event",</span> |
| <span class="sd"> ... count(lit(1)).alias("rc"),</span> |
| <span class="sd"> ... count(col("error")).alias("erc"))</span> |
| <span class="sd"> >>> try:</span> |
| <span class="sd"> ... q = observed_ds.writeStream.format("console").start()</span> |
| <span class="sd"> ... time.sleep(5)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ... finally:</span> |
| <span class="sd"> ... q.stop()</span> |
| <span class="sd"> ... spark.streams.removeListener(error_listener)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.union"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.union.html#pyspark.sql.DataFrame.union">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">union</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Return a new :class:`DataFrame` containing the union of rows in this and another</span> |
| <span class="sd"> :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : :class:`DataFrame`</span> |
| <span class="sd"> Another :class:`DataFrame` that needs to be unioned.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> A new :class:`DataFrame` containing the combined rows with corresponding columns.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.unionAll</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This method performs a SQL-style set union of the rows from both `DataFrame` objects,</span> |
| <span class="sd"> with no automatic deduplication of elements.</span> |
| |
| <span class="sd"> Use the `distinct()` method to perform deduplication of rows.</span> |
| |
| <span class="sd"> The method resolves columns by position (not by name), following the standard behavior</span> |
| <span class="sd"> in SQL.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Combining two DataFrames with the same schema</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([(1, 'A'), (2, 'B')], ['id', 'value'])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(3, 'C'), (4, 'D')], ['id', 'value'])</span> |
| <span class="sd"> >>> df3 = df1.union(df2)</span> |
| <span class="sd"> >>> df3.show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | id|value|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 1| A|</span> |
| <span class="sd"> | 2| B|</span> |
| <span class="sd"> | 3| C|</span> |
| <span class="sd"> | 4| D|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> Example 2: Combining two DataFrames with different schemas</span> |
| |
| <span class="sd"> >>> from pyspark.sql.functions import lit</span> |
| <span class="sd"> >>> df1 = spark.createDataFrame([(100001, 1), (100002, 2)], schema="id LONG, money INT")</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(3, 100003), (4, 100003)], schema="money INT, id LONG")</span> |
| <span class="sd"> >>> df1 = df1.withColumn("age", lit(30))</span> |
| <span class="sd"> >>> df2 = df2.withColumn("age", lit(40))</span> |
| <span class="sd"> >>> df3 = df1.union(df2)</span> |
| <span class="sd"> >>> df3.show()</span> |
| <span class="sd"> +------+------+---+</span> |
| <span class="sd"> | id| money|age|</span> |
| <span class="sd"> +------+------+---+</span> |
| <span class="sd"> |100001| 1| 30|</span> |
| <span class="sd"> |100002| 2| 30|</span> |
| <span class="sd"> | 3|100003| 40|</span> |
| <span class="sd"> | 4|100003| 40|</span> |
| <span class="sd"> +------+------+---+</span> |
| |
| <span class="sd"> Example 3: Combining two DataFrames with mismatched columns</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([(1, 2)], ["A", "B"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(3, 4)], ["C", "D"])</span> |
| <span class="sd"> >>> df3 = df1.union(df2)</span> |
| <span class="sd"> >>> df3.show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | A| B|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 2|</span> |
| <span class="sd"> | 3| 4|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> Example 4: Combining duplicate rows from two different DataFrames</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([(1, 'A'), (2, 'B'), (3, 'C')], ['id', 'value'])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(3, 'C'), (4, 'D')], ['id', 'value'])</span> |
| <span class="sd"> >>> df3 = df1.union(df2).distinct().sort("id")</span> |
| <span class="sd"> >>> df3.show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | id|value|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 1| A|</span> |
| <span class="sd"> | 2| B|</span> |
| <span class="sd"> | 3| C|</span> |
| <span class="sd"> | 4| D|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.unionAll"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.unionAll.html#pyspark.sql.DataFrame.unionAll">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">unionAll</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Return a new :class:`DataFrame` containing the union of rows in this and another</span> |
| <span class="sd"> :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : :class:`DataFrame`</span> |
| <span class="sd"> Another :class:`DataFrame` that needs to be combined</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> A new :class:`DataFrame` containing combined rows from both dataframes.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This method combines all rows from both `DataFrame` objects with no automatic</span> |
| <span class="sd"> deduplication of elements.</span> |
| |
| <span class="sd"> Use the `distinct()` method to perform deduplication of rows.</span> |
| |
| <span class="sd"> :func:`unionAll` is an alias to :func:`union`</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> DataFrame.union</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.unionByName"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.unionByName.html#pyspark.sql.DataFrame.unionByName">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">unionByName</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">,</span> <span class="n">allowMissingColumns</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`DataFrame` containing union of rows in this and another</span> |
| <span class="sd"> :class:`DataFrame`.</span> |
| |
| <span class="sd"> This method performs a union operation on both input DataFrames, resolving columns by</span> |
| <span class="sd"> name (rather than position). When `allowMissingColumns` is True, missing columns will</span> |
| <span class="sd"> be filled with null.</span> |
| |
| <span class="sd"> .. versionadded:: 2.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : :class:`DataFrame`</span> |
| <span class="sd"> Another :class:`DataFrame` that needs to be combined.</span> |
| <span class="sd"> allowMissingColumns : bool, optional, default False</span> |
| <span class="sd"> Specify whether to allow missing columns.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> A new :class:`DataFrame` containing the combined rows with corresponding</span> |
| <span class="sd"> columns of the two given DataFrames.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Union of two DataFrames with same columns in different order.</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col0"])</span> |
| <span class="sd"> >>> df1.unionByName(df2).show()</span> |
| <span class="sd"> +----+----+----+</span> |
| <span class="sd"> |col0|col1|col2|</span> |
| <span class="sd"> +----+----+----+</span> |
| <span class="sd"> | 1| 2| 3|</span> |
| <span class="sd"> | 6| 4| 5|</span> |
| <span class="sd"> +----+----+----+</span> |
| |
| <span class="sd"> Example 2: Union with missing columns and setting `allowMissingColumns=True`.</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([[4, 5, 6]], ["col1", "col2", "col3"])</span> |
| <span class="sd"> >>> df1.unionByName(df2, allowMissingColumns=True).show()</span> |
| <span class="sd"> +----+----+----+----+</span> |
| <span class="sd"> |col0|col1|col2|col3|</span> |
| <span class="sd"> +----+----+----+----+</span> |
| <span class="sd"> | 1| 2| 3|NULL|</span> |
| <span class="sd"> |NULL| 4| 5| 6|</span> |
| <span class="sd"> +----+----+----+----+</span> |
| |
| <span class="sd"> Example 3: Union of two DataFrames with few common columns.</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([[1, 2, 3]], ["col0", "col1", "col2"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([[4, 5, 6, 7]], ["col1", "col2", "col3", "col4"])</span> |
| <span class="sd"> >>> df1.unionByName(df2, allowMissingColumns=True).show()</span> |
| <span class="sd"> +----+----+----+----+----+</span> |
| <span class="sd"> |col0|col1|col2|col3|col4|</span> |
| <span class="sd"> +----+----+----+----+----+</span> |
| <span class="sd"> | 1| 2| 3|NULL|NULL|</span> |
| <span class="sd"> |NULL| 4| 5| 6| 7|</span> |
| <span class="sd"> +----+----+----+----+----+</span> |
| |
| <span class="sd"> Example 4: Union of two DataFrames with completely different columns.</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([[0, 1, 2]], ["col0", "col1", "col2"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([[3, 4, 5]], ["col3", "col4", "col5"])</span> |
| <span class="sd"> >>> df1.unionByName(df2, allowMissingColumns=True).show()</span> |
| <span class="sd"> +----+----+----+----+----+----+</span> |
| <span class="sd"> |col0|col1|col2|col3|col4|col5|</span> |
| <span class="sd"> +----+----+----+----+----+----+</span> |
| <span class="sd"> | 0| 1| 2|NULL|NULL|NULL|</span> |
| <span class="sd"> |NULL|NULL|NULL| 3| 4| 5|</span> |
| <span class="sd"> +----+----+----+----+----+----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.intersect"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.intersect.html#pyspark.sql.DataFrame.intersect">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">intersect</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Return a new :class:`DataFrame` containing rows only in</span> |
| <span class="sd"> both this :class:`DataFrame` and another :class:`DataFrame`.</span> |
| <span class="sd"> Note that any duplicates are removed. To preserve duplicates</span> |
| <span class="sd"> use :func:`intersectAll`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : :class:`DataFrame`</span> |
| <span class="sd"> Another :class:`DataFrame` that needs to be combined.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Combined DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This is equivalent to `INTERSECT` in SQL.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Intersecting two DataFrames with the same schema</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])</span> |
| <span class="sd"> >>> result_df = df1.intersect(df2).sort("C1", "C2")</span> |
| <span class="sd"> >>> result_df.show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | C1| C2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | b| 3|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> Example 2: Intersecting two DataFrames with different schemas</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([(1, "A"), (2, "B")], ["id", "value"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(2, "B"), (3, "C")], ["id", "value"])</span> |
| <span class="sd"> >>> result_df = df1.intersect(df2).sort("id", "value")</span> |
| <span class="sd"> >>> result_df.show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | id|value|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2| B|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> Example 3: Intersecting all rows from two DataFrames with mismatched columns</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([(1, 2), (1, 2), (3, 4)], ["A", "B"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(1, 2), (1, 2)], ["C", "D"])</span> |
| <span class="sd"> >>> result_df = df1.intersect(df2).sort("A", "B")</span> |
| <span class="sd"> >>> result_df.show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | A| B|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.intersectAll"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.intersectAll.html#pyspark.sql.DataFrame.intersectAll">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">intersectAll</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Return a new :class:`DataFrame` containing rows in both this :class:`DataFrame`</span> |
| <span class="sd"> and another :class:`DataFrame` while preserving duplicates.</span> |
| |
| <span class="sd"> This is equivalent to `INTERSECT ALL` in SQL. As standard in SQL, this function</span> |
| <span class="sd"> resolves columns by position (not by name).</span> |
| |
| <span class="sd"> .. versionadded:: 2.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : :class:`DataFrame`</span> |
| <span class="sd"> Another :class:`DataFrame` that needs to be combined.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Combined DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Intersecting two DataFrames with the same schema</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])</span> |
| <span class="sd"> >>> result_df = df1.intersectAll(df2).sort("C1", "C2")</span> |
| <span class="sd"> >>> result_df.show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | C1| C2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | a| 1|</span> |
| <span class="sd"> | b| 3|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> Example 2: Intersecting two DataFrames with different schemas</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([(1, "A"), (2, "B")], ["id", "value"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(2, "B"), (3, "C")], ["id", "value"])</span> |
| <span class="sd"> >>> result_df = df1.intersectAll(df2).sort("id", "value")</span> |
| <span class="sd"> >>> result_df.show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | id|value|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2| B|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> Example 3: Intersecting all rows from two DataFrames with mismatched columns</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([(1, 2), (1, 2), (3, 4)], ["A", "B"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(1, 2), (1, 2)], ["C", "D"])</span> |
| <span class="sd"> >>> result_df = df1.intersectAll(df2).sort("A", "B")</span> |
| <span class="sd"> >>> result_df.show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | A| B|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 2|</span> |
| <span class="sd"> | 1| 2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.subtract"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.subtract.html#pyspark.sql.DataFrame.subtract">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">subtract</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Return a new :class:`DataFrame` containing rows in this :class:`DataFrame`</span> |
| <span class="sd"> but not in another :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : :class:`DataFrame`</span> |
| <span class="sd"> Another :class:`DataFrame` that needs to be subtracted.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Subtracted DataFrame.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This is equivalent to `EXCEPT DISTINCT` in SQL.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Subtracting two DataFrames with the same schema</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3), ("c", 4)], ["C1", "C2"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([("a", 1), ("a", 1), ("b", 3)], ["C1", "C2"])</span> |
| <span class="sd"> >>> result_df = df1.subtract(df2)</span> |
| <span class="sd"> >>> result_df.show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | C1| C2|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | c| 4|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> Example 2: Subtracting two DataFrames with different schemas</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([(1, "A"), (2, "B")], ["id", "value"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(2, "B"), (3, "C")], ["id", "value"])</span> |
| <span class="sd"> >>> result_df = df1.subtract(df2)</span> |
| <span class="sd"> >>> result_df.show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | id|value|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 1| A|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> Example 3: Subtracting two DataFrames with mismatched columns</span> |
| |
| <span class="sd"> >>> df1 = spark.createDataFrame([(1, 2)], ["A", "B"])</span> |
| <span class="sd"> >>> df2 = spark.createDataFrame([(1, 2)], ["C", "D"])</span> |
| <span class="sd"> >>> result_df = df1.subtract(df2)</span> |
| <span class="sd"> >>> result_df.show()</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | A| B|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.dropDuplicates"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.dropDuplicates.html#pyspark.sql.DataFrame.dropDuplicates">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">dropDuplicates</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Return a new :class:`DataFrame` with duplicate rows removed,</span> |
| <span class="sd"> optionally only considering certain columns.</span> |
| |
| <span class="sd"> For a static batch :class:`DataFrame`, it just drops duplicate rows. For a streaming</span> |
| <span class="sd"> :class:`DataFrame`, it will keep all data across triggers as intermediate state to drop</span> |
| <span class="sd"> duplicates rows. You can use :func:`withWatermark` to limit how late the duplicate data can</span> |
| <span class="sd"> be and the system will accordingly limit the state. In addition, data older than</span> |
| <span class="sd"> watermark will be dropped to avoid any possibility of duplicates.</span> |
| |
| <span class="sd"> :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> subset : list of column names, optional</span> |
| <span class="sd"> List of columns to use for duplicate comparison (default All columns).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> DataFrame without duplicates.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... Row(name='Alice', age=5, height=80),</span> |
| <span class="sd"> ... Row(name='Alice', age=5, height=80),</span> |
| <span class="sd"> ... Row(name='Alice', age=10, height=80)</span> |
| <span class="sd"> ... ])</span> |
| |
| <span class="sd"> Deduplicate the same rows.</span> |
| |
| <span class="sd"> >>> df.dropDuplicates().show()</span> |
| <span class="sd"> +-----+---+------+</span> |
| <span class="sd"> | name|age|height|</span> |
| <span class="sd"> +-----+---+------+</span> |
| <span class="sd"> |Alice| 5| 80|</span> |
| <span class="sd"> |Alice| 10| 80|</span> |
| <span class="sd"> +-----+---+------+</span> |
| |
| <span class="sd"> Deduplicate values on 'name' and 'height' columns.</span> |
| |
| <span class="sd"> >>> df.dropDuplicates(['name', 'height']).show()</span> |
| <span class="sd"> +-----+---+------+</span> |
| <span class="sd"> | name|age|height|</span> |
| <span class="sd"> +-----+---+------+</span> |
| <span class="sd"> |Alice| 5| 80|</span> |
| <span class="sd"> +-----+---+------+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.dropDuplicatesWithinWatermark"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.dropDuplicatesWithinWatermark.html#pyspark.sql.DataFrame.dropDuplicatesWithinWatermark">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">dropDuplicatesWithinWatermark</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Return a new :class:`DataFrame` with duplicate rows removed,</span> |
| <span class="sd"> optionally only considering certain columns, within watermark.</span> |
| |
| <span class="sd"> This only works with streaming :class:`DataFrame`, and watermark for the input</span> |
| <span class="sd"> :class:`DataFrame` must be set via :func:`withWatermark`.</span> |
| |
| <span class="sd"> For a streaming :class:`DataFrame`, this will keep all data across triggers as intermediate</span> |
| <span class="sd"> state to drop duplicated rows. The state will be kept to guarantee the semantic, "Events</span> |
| <span class="sd"> are deduplicated as long as the time distance of earliest and latest events are smaller</span> |
| <span class="sd"> than the delay threshold of watermark." Users are encouraged to set the delay threshold of</span> |
| <span class="sd"> watermark longer than max timestamp differences among duplicated events.</span> |
| |
| <span class="sd"> Note: too late data older than watermark will be dropped.</span> |
| |
| <span class="sd"> .. versionadded:: 3.5.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> subset : List of column names, optional</span> |
| <span class="sd"> List of columns to use for duplicate comparison (default All columns).</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> DataFrame without duplicates.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> from pyspark.sql.functions import timestamp_seconds</span> |
| <span class="sd"> >>> df = spark.readStream.format("rate").load().selectExpr(</span> |
| <span class="sd"> ... "value % 5 AS value", "timestamp")</span> |
| <span class="sd"> >>> df.select("value", df.timestamp.alias("time")).withWatermark("time", '10 minutes')</span> |
| <span class="sd"> DataFrame[value: bigint, time: timestamp]</span> |
| |
| <span class="sd"> Deduplicate the same rows.</span> |
| |
| <span class="sd"> >>> df.dropDuplicatesWithinWatermark() # doctest: +SKIP</span> |
| |
| <span class="sd"> Deduplicate values on 'value' columns.</span> |
| |
| <span class="sd"> >>> df.dropDuplicatesWithinWatermark(['value']) # doctest: +SKIP</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.dropna"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.dropna.html#pyspark.sql.DataFrame.dropna">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">dropna</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">how</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"any"</span><span class="p">,</span> |
| <span class="n">thresh</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="o">...</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`DataFrame` omitting rows with null values.</span> |
| <span class="sd"> :func:`DataFrame.dropna` and :func:`DataFrameNaFunctions.drop` are</span> |
| <span class="sd"> aliases of each other.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.1</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> how : str, optional, the values that can be 'any' or 'all', default 'any'.</span> |
| <span class="sd"> If 'any', drop a row if it contains any nulls.</span> |
| <span class="sd"> If 'all', drop a row only if all its values are null.</span> |
| <span class="sd"> thresh: int, optional, default None.</span> |
| <span class="sd"> If specified, drop rows that have less than `thresh` non-null values.</span> |
| <span class="sd"> This overwrites the `how` parameter.</span> |
| <span class="sd"> subset : str, tuple or list, optional</span> |
| <span class="sd"> optional list of column names to consider.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> DataFrame with null only rows excluded.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql import Row</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... Row(age=10, height=80, name="Alice"),</span> |
| <span class="sd"> ... Row(age=5, height=None, name="Bob"),</span> |
| <span class="sd"> ... Row(age=None, height=None, name="Tom"),</span> |
| <span class="sd"> ... Row(age=None, height=None, name=None),</span> |
| <span class="sd"> ... ])</span> |
| |
| <span class="sd"> Example 1: Drop the row if it contains any nulls.</span> |
| |
| <span class="sd"> >>> df.na.drop().show()</span> |
| <span class="sd"> +---+------+-----+</span> |
| <span class="sd"> |age|height| name|</span> |
| <span class="sd"> +---+------+-----+</span> |
| <span class="sd"> | 10| 80|Alice|</span> |
| <span class="sd"> +---+------+-----+</span> |
| |
| <span class="sd"> Example 2: Drop the row only if all its values are null.</span> |
| |
| <span class="sd"> >>> df.na.drop(how='all').show()</span> |
| <span class="sd"> +----+------+-----+</span> |
| <span class="sd"> | age|height| name|</span> |
| <span class="sd"> +----+------+-----+</span> |
| <span class="sd"> | 10| 80|Alice|</span> |
| <span class="sd"> | 5| NULL| Bob|</span> |
| <span class="sd"> |NULL| NULL| Tom|</span> |
| <span class="sd"> +----+------+-----+</span> |
| |
| <span class="sd"> Example 3: Drop rows that have less than `thresh` non-null values.</span> |
| |
| <span class="sd"> >>> df.na.drop(thresh=2).show()</span> |
| <span class="sd"> +---+------+-----+</span> |
| <span class="sd"> |age|height| name|</span> |
| <span class="sd"> +---+------+-----+</span> |
| <span class="sd"> | 10| 80|Alice|</span> |
| <span class="sd"> | 5| NULL| Bob|</span> |
| <span class="sd"> +---+------+-----+</span> |
| |
| <span class="sd"> Example 4: Drop rows with non-null values in the specified columns.</span> |
| |
| <span class="sd"> >>> df.na.drop(subset=['age', 'name']).show()</span> |
| <span class="sd"> +---+------+-----+</span> |
| <span class="sd"> |age|height| name|</span> |
| <span class="sd"> +---+------+-----+</span> |
| <span class="sd"> | 10| 80|Alice|</span> |
| <span class="sd"> | 5| NULL| Bob|</span> |
| <span class="sd"> +---+------+-----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">value</span><span class="p">:</span> <span class="s2">"LiteralType"</span><span class="p">,</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="o">...</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="s2">"LiteralType"</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.fillna"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.fillna.html#pyspark.sql.DataFrame.fillna">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">fillna</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"LiteralType"</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="s2">"LiteralType"</span><span class="p">]],</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="o">...</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`DataFrame` which null values are filled with new value.</span> |
| <span class="sd"> :func:`DataFrame.fillna` and :func:`DataFrameNaFunctions.fill` are</span> |
| <span class="sd"> aliases of each other.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.1</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> value : int, float, string, bool or dict, the value to replace null values with.</span> |
| <span class="sd"> If the value is a dict, then `subset` is ignored and `value` must be a mapping</span> |
| <span class="sd"> from column name (string) to replacement value. The replacement value must be</span> |
| <span class="sd"> an int, float, boolean, or string.</span> |
| <span class="sd"> subset : str, tuple or list, optional</span> |
| <span class="sd"> optional list of column names to consider.</span> |
| <span class="sd"> Columns specified in subset that do not have matching data types are ignored.</span> |
| <span class="sd"> For example, if `value` is a string, and subset contains a non-string column,</span> |
| <span class="sd"> then the non-string column is simply ignored.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> DataFrame with replaced null values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (10, 80.5, "Alice", None),</span> |
| <span class="sd"> ... (5, None, "Bob", None),</span> |
| <span class="sd"> ... (None, None, "Tom", None),</span> |
| <span class="sd"> ... (None, None, None, True)],</span> |
| <span class="sd"> ... schema=["age", "height", "name", "bool"])</span> |
| |
| <span class="sd"> Example 1: Fill all null values with 50 for numeric columns.</span> |
| |
| <span class="sd"> >>> df.na.fill(50).show()</span> |
| <span class="sd"> +---+------+-----+----+</span> |
| <span class="sd"> |age|height| name|bool|</span> |
| <span class="sd"> +---+------+-----+----+</span> |
| <span class="sd"> | 10| 80.5|Alice|NULL|</span> |
| <span class="sd"> | 5| 50.0| Bob|NULL|</span> |
| <span class="sd"> | 50| 50.0| Tom|NULL|</span> |
| <span class="sd"> | 50| 50.0| NULL|true|</span> |
| <span class="sd"> +---+------+-----+----+</span> |
| |
| <span class="sd"> Example 2: Fill all null values with ``False`` for boolean columns.</span> |
| |
| <span class="sd"> >>> df.na.fill(False).show()</span> |
| <span class="sd"> +----+------+-----+-----+</span> |
| <span class="sd"> | age|height| name| bool|</span> |
| <span class="sd"> +----+------+-----+-----+</span> |
| <span class="sd"> | 10| 80.5|Alice|false|</span> |
| <span class="sd"> | 5| NULL| Bob|false|</span> |
| <span class="sd"> |NULL| NULL| Tom|false|</span> |
| <span class="sd"> |NULL| NULL| NULL| true|</span> |
| <span class="sd"> +----+------+-----+-----+</span> |
| |
| <span class="sd"> Example 3: Fill all null values with to 50 and "unknown" for</span> |
| <span class="sd"> 'age' and 'name' column respectively.</span> |
| |
| <span class="sd"> >>> df.na.fill({'age': 50, 'name': 'unknown'}).show()</span> |
| <span class="sd"> +---+------+-------+----+</span> |
| <span class="sd"> |age|height| name|bool|</span> |
| <span class="sd"> +---+------+-------+----+</span> |
| <span class="sd"> | 10| 80.5| Alice|NULL|</span> |
| <span class="sd"> | 5| NULL| Bob|NULL|</span> |
| <span class="sd"> | 50| NULL| Tom|NULL|</span> |
| <span class="sd"> | 50| NULL|unknown|true|</span> |
| <span class="sd"> +---+------+-------+----+</span> |
| |
| <span class="sd"> Example 4: Fill all null values with "Spark" for 'name' column.</span> |
| |
| <span class="sd"> >>> df.na.fill(value = 'Spark', subset = 'name').show()</span> |
| <span class="sd"> +----+------+-----+----+</span> |
| <span class="sd"> | age|height| name|bool|</span> |
| <span class="sd"> +----+------+-----+----+</span> |
| <span class="sd"> | 10| 80.5|Alice|NULL|</span> |
| <span class="sd"> | 5| NULL| Bob|NULL|</span> |
| <span class="sd"> |NULL| NULL| Tom|NULL|</span> |
| <span class="sd"> |NULL| NULL|Spark|true|</span> |
| <span class="sd"> +----+------+-----+----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">replace</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">to_replace</span><span class="p">:</span> <span class="s2">"LiteralType"</span><span class="p">,</span> |
| <span class="n">value</span><span class="p">:</span> <span class="s2">"OptionalPrimitiveType"</span><span class="p">,</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">replace</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">to_replace</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">"LiteralType"</span><span class="p">],</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">"OptionalPrimitiveType"</span><span class="p">],</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">replace</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">to_replace</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="s2">"LiteralType"</span><span class="p">,</span> <span class="s2">"OptionalPrimitiveType"</span><span class="p">],</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">replace</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">to_replace</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">"LiteralType"</span><span class="p">],</span> |
| <span class="n">value</span><span class="p">:</span> <span class="s2">"OptionalPrimitiveType"</span><span class="p">,</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.replace"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.replace.html#pyspark.sql.DataFrame.replace">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="k">def</span> <span class="nf">replace</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">to_replace</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span> |
| <span class="s2">"LiteralType"</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">"LiteralType"</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="s2">"LiteralType"</span><span class="p">,</span> <span class="s2">"OptionalPrimitiveType"</span><span class="p">]</span> |
| <span class="p">],</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span> |
| <span class="n">Union</span><span class="p">[</span><span class="s2">"OptionalPrimitiveType"</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">"OptionalPrimitiveType"</span><span class="p">],</span> <span class="n">_NoValueType</span><span class="p">]</span> |
| <span class="p">]</span> <span class="o">=</span> <span class="n">_NoValue</span><span class="p">,</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`DataFrame` replacing a value with another value.</span> |
| <span class="sd"> :func:`DataFrame.replace` and :func:`DataFrameNaFunctions.replace` are</span> |
| <span class="sd"> aliases of each other.</span> |
| <span class="sd"> Values to_replace and value must have the same type and can only be numerics, booleans,</span> |
| <span class="sd"> or strings. Value can have None. When replacing, the new value will be cast</span> |
| <span class="sd"> to the type of the existing column.</span> |
| <span class="sd"> For numeric replacements all values to be replaced should have unique</span> |
| <span class="sd"> floating point representation. In case of conflicts (for example with `{42: -1, 42.0: 1}`)</span> |
| <span class="sd"> and arbitrary replacement will be used.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> to_replace : bool, int, float, string, list or dict, the value to be replaced.</span> |
| <span class="sd"> If the value is a dict, then `value` is ignored or can be omitted, and `to_replace`</span> |
| <span class="sd"> must be a mapping between a value and a replacement.</span> |
| <span class="sd"> value : bool, int, float, string or None, optional</span> |
| <span class="sd"> The replacement value must be a bool, int, float, string or None. If `value` is a</span> |
| <span class="sd"> list, `value` should be of the same length and type as `to_replace`.</span> |
| <span class="sd"> If `value` is a scalar and `to_replace` is a sequence, then `value` is</span> |
| <span class="sd"> used as a replacement for each item in `to_replace`.</span> |
| <span class="sd"> subset : list, optional</span> |
| <span class="sd"> optional list of column names to consider.</span> |
| <span class="sd"> Columns specified in subset that do not have matching data types are ignored.</span> |
| <span class="sd"> For example, if `value` is a string, and subset contains a non-string column,</span> |
| <span class="sd"> then the non-string column is simply ignored.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> DataFrame with replaced values.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([</span> |
| <span class="sd"> ... (10, 80, "Alice"),</span> |
| <span class="sd"> ... (5, None, "Bob"),</span> |
| <span class="sd"> ... (None, 10, "Tom"),</span> |
| <span class="sd"> ... (None, None, None)],</span> |
| <span class="sd"> ... schema=["age", "height", "name"])</span> |
| |
| <span class="sd"> Example 1: Replace 10 to 20 in all columns.</span> |
| |
| <span class="sd"> >>> df.na.replace(10, 20).show()</span> |
| <span class="sd"> +----+------+-----+</span> |
| <span class="sd"> | age|height| name|</span> |
| <span class="sd"> +----+------+-----+</span> |
| <span class="sd"> | 20| 80|Alice|</span> |
| <span class="sd"> | 5| NULL| Bob|</span> |
| <span class="sd"> |NULL| 20| Tom|</span> |
| <span class="sd"> |NULL| NULL| NULL|</span> |
| <span class="sd"> +----+------+-----+</span> |
| |
| <span class="sd"> Example 2: Replace 'Alice' to null in all columns.</span> |
| |
| <span class="sd"> >>> df.na.replace('Alice', None).show()</span> |
| <span class="sd"> +----+------+----+</span> |
| <span class="sd"> | age|height|name|</span> |
| <span class="sd"> +----+------+----+</span> |
| <span class="sd"> | 10| 80|NULL|</span> |
| <span class="sd"> | 5| NULL| Bob|</span> |
| <span class="sd"> |NULL| 10| Tom|</span> |
| <span class="sd"> |NULL| NULL|NULL|</span> |
| <span class="sd"> +----+------+----+</span> |
| |
| <span class="sd"> Example 3: Replace 'Alice' to 'A', and 'Bob' to 'B' in the 'name' column.</span> |
| |
| <span class="sd"> >>> df.na.replace(['Alice', 'Bob'], ['A', 'B'], 'name').show()</span> |
| <span class="sd"> +----+------+----+</span> |
| <span class="sd"> | age|height|name|</span> |
| <span class="sd"> +----+------+----+</span> |
| <span class="sd"> | 10| 80| A|</span> |
| <span class="sd"> | 5| NULL| B|</span> |
| <span class="sd"> |NULL| 10| Tom|</span> |
| <span class="sd"> |NULL| NULL|NULL|</span> |
| <span class="sd"> +----+------+----+</span> |
| |
| <span class="sd"> Example 4: Replace 10 to 20 in the 'name' column.</span> |
| |
| <span class="sd"> >>> df.na.replace(10, 18, 'age').show()</span> |
| <span class="sd"> +----+------+-----+</span> |
| <span class="sd"> | age|height| name|</span> |
| <span class="sd"> +----+------+-----+</span> |
| <span class="sd"> | 18| 80|Alice|</span> |
| <span class="sd"> | 5| NULL| Bob|</span> |
| <span class="sd"> |NULL| 10| Tom|</span> |
| <span class="sd"> |NULL| NULL| NULL|</span> |
| <span class="sd"> +----+------+-----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">approxQuantile</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">col</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">probabilities</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">approxQuantile</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">col</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span> |
| <span class="n">probabilities</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.approxQuantile"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.approxQuantile.html#pyspark.sql.DataFrame.approxQuantile">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">approxQuantile</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">col</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span> |
| <span class="n">probabilities</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Calculates the approximate quantiles of numerical columns of a</span> |
| <span class="sd"> :class:`DataFrame`.</span> |
| |
| <span class="sd"> The result of this algorithm has the following deterministic bound:</span> |
| <span class="sd"> If the :class:`DataFrame` has N elements and if we request the quantile at</span> |
| <span class="sd"> probability `p` up to error `err`, then the algorithm will return</span> |
| <span class="sd"> a sample `x` from the :class:`DataFrame` so that the *exact* rank of `x` is</span> |
| <span class="sd"> close to (p * N). More precisely,</span> |
| |
| <span class="sd"> floor((p - err) * N) <= rank(x) <= ceil((p + err) * N).</span> |
| |
| <span class="sd"> This method implements a variation of the Greenwald-Khanna</span> |
| <span class="sd"> algorithm (with some speed optimizations). The algorithm was first</span> |
| <span class="sd"> present in [[https://doi.org/10.1145/375663.375670</span> |
| <span class="sd"> Space-efficient Online Computation of Quantile Summaries]]</span> |
| <span class="sd"> by Greenwald and Khanna.</span> |
| |
| <span class="sd"> .. versionadded:: 2.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col: str, tuple or list</span> |
| <span class="sd"> Can be a single column name, or a list of names for multiple columns.</span> |
| |
| <span class="sd"> .. versionchanged:: 2.2.0</span> |
| <span class="sd"> Added support for multiple columns.</span> |
| <span class="sd"> probabilities : list or tuple of floats</span> |
| <span class="sd"> a list of quantile probabilities</span> |
| <span class="sd"> Each number must be a float in the range [0, 1].</span> |
| <span class="sd"> For example 0.0 is the minimum, 0.5 is the median, 1.0 is the maximum.</span> |
| <span class="sd"> relativeError : float</span> |
| <span class="sd"> The relative target precision to achieve</span> |
| <span class="sd"> (>= 0). If set to zero, the exact quantiles are computed, which</span> |
| <span class="sd"> could be very expensive. Note that values greater than 1 are</span> |
| <span class="sd"> accepted but gives the same result as 1.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> list</span> |
| <span class="sd"> the approximate quantiles at the given probabilities.</span> |
| |
| <span class="sd"> * If the input `col` is a string, the output is a list of floats.</span> |
| |
| <span class="sd"> * If the input `col` is a list or tuple of strings, the output is also a</span> |
| <span class="sd"> list, but each element in it is a list of floats, i.e., the output</span> |
| <span class="sd"> is a list of list of floats.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Null values will be ignored in numerical columns before calculation.</span> |
| <span class="sd"> For columns only containing null values, an empty list is returned.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Calculating quantiles for a single column</span> |
| |
| <span class="sd"> >>> data = [(1,), (2,), (3,), (4,), (5,)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ["values"])</span> |
| <span class="sd"> >>> quantiles = df.approxQuantile("values", [0.0, 0.5, 1.0], 0.05)</span> |
| <span class="sd"> >>> quantiles</span> |
| <span class="sd"> [1.0, 3.0, 5.0]</span> |
| |
| <span class="sd"> Example 2: Calculating quantiles for multiple columns</span> |
| |
| <span class="sd"> >>> data = [(1, 10), (2, 20), (3, 30), (4, 40), (5, 50)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ["col1", "col2"])</span> |
| <span class="sd"> >>> quantiles = df.approxQuantile(["col1", "col2"], [0.0, 0.5, 1.0], 0.05)</span> |
| <span class="sd"> >>> quantiles</span> |
| <span class="sd"> [[1.0, 3.0, 5.0], [10.0, 30.0, 50.0]]</span> |
| |
| <span class="sd"> Example 3: Handling null values</span> |
| |
| <span class="sd"> >>> data = [(1,), (None,), (3,), (4,), (None,)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ["values"])</span> |
| <span class="sd"> >>> quantiles = df.approxQuantile("values", [0.0, 0.5, 1.0], 0.05)</span> |
| <span class="sd"> >>> quantiles</span> |
| <span class="sd"> [1.0, 3.0, 4.0]</span> |
| |
| <span class="sd"> Example 4: Calculating quantiles with low precision</span> |
| |
| <span class="sd"> >>> data = [(1,), (2,), (3,), (4,), (5,)]</span> |
| <span class="sd"> >>> df = spark.createDataFrame(data, ["values"])</span> |
| <span class="sd"> >>> quantiles = df.approxQuantile("values", [0.0, 0.2, 1.0], 0.1)</span> |
| <span class="sd"> >>> quantiles</span> |
| <span class="sd"> [1.0, 1.0, 5.0]</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.corr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.corr.html#pyspark.sql.DataFrame.corr">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">corr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Calculates the correlation of two columns of a :class:`DataFrame` as a double value.</span> |
| <span class="sd"> Currently only supports the Pearson Correlation Coefficient.</span> |
| <span class="sd"> :func:`DataFrame.corr` and :func:`DataFrameStatFunctions.corr` are aliases of each other.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : str</span> |
| <span class="sd"> The name of the first column</span> |
| <span class="sd"> col2 : str</span> |
| <span class="sd"> The name of the second column</span> |
| <span class="sd"> method : str, optional</span> |
| <span class="sd"> The correlation method. Currently only supports "pearson"</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> float</span> |
| <span class="sd"> Pearson Correlation Coefficient of two columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, 12), (10, 1), (19, 8)], ["c1", "c2"])</span> |
| <span class="sd"> >>> df.corr("c1", "c2")</span> |
| <span class="sd"> -0.3592106040535498</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(11, 12), (10, 11), (9, 10)], ["small", "bigger"])</span> |
| <span class="sd"> >>> df.corr("small", "bigger")</span> |
| <span class="sd"> 1.0</span> |
| |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.cov"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.cov.html#pyspark.sql.DataFrame.cov">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">cov</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Calculate the sample covariance for the given columns, specified by their names, as a</span> |
| <span class="sd"> double value. :func:`DataFrame.cov` and :func:`DataFrameStatFunctions.cov` are aliases.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : str</span> |
| <span class="sd"> The name of the first column</span> |
| <span class="sd"> col2 : str</span> |
| <span class="sd"> The name of the second column</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> float</span> |
| <span class="sd"> Covariance of two columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, 12), (10, 1), (19, 8)], ["c1", "c2"])</span> |
| <span class="sd"> >>> df.cov("c1", "c2")</span> |
| <span class="sd"> -18.0</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(11, 12), (10, 11), (9, 10)], ["small", "bigger"])</span> |
| <span class="sd"> >>> df.cov("small", "bigger")</span> |
| <span class="sd"> 1.0</span> |
| |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.crosstab"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.crosstab.html#pyspark.sql.DataFrame.crosstab">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">crosstab</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Computes a pair-wise frequency table of the given columns. Also known as a contingency</span> |
| <span class="sd"> table.</span> |
| <span class="sd"> The first column of each row will be the distinct values of `col1` and the column names</span> |
| <span class="sd"> will be the distinct values of `col2`. The name of the first column will be `$col1_$col2`.</span> |
| <span class="sd"> Pairs that have no occurrences will have zero as their counts.</span> |
| <span class="sd"> :func:`DataFrame.crosstab` and :func:`DataFrameStatFunctions.crosstab` are aliases.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> col1 : str</span> |
| <span class="sd"> The name of the first column. Distinct items will make the first item of</span> |
| <span class="sd"> each row.</span> |
| <span class="sd"> col2 : str</span> |
| <span class="sd"> The name of the second column. Distinct items will make the column names</span> |
| <span class="sd"> of the :class:`DataFrame`.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Frequency matrix of two columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, 11), (1, 11), (3, 10), (4, 8), (4, 8)], ["c1", "c2"])</span> |
| <span class="sd"> >>> df.crosstab("c1", "c2").sort("c1_c2").show()</span> |
| <span class="sd"> +-----+---+---+---+</span> |
| <span class="sd"> |c1_c2| 10| 11| 8|</span> |
| <span class="sd"> +-----+---+---+---+</span> |
| <span class="sd"> | 1| 0| 2| 0|</span> |
| <span class="sd"> | 3| 1| 0| 0|</span> |
| <span class="sd"> | 4| 0| 0| 2|</span> |
| <span class="sd"> +-----+---+---+---+</span> |
| |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.freqItems"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.freqItems.html#pyspark.sql.DataFrame.freqItems">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">freqItems</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span> <span class="n">support</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Finding frequent items for columns, possibly with false positives. Using the</span> |
| <span class="sd"> frequent element count algorithm described in</span> |
| <span class="sd"> "https://doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou".</span> |
| <span class="sd"> :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols : list or tuple</span> |
| <span class="sd"> Names of the columns to calculate frequent items for as a list or tuple of</span> |
| <span class="sd"> strings.</span> |
| <span class="sd"> support : float, optional</span> |
| <span class="sd"> The frequency with which to consider an item 'frequent'. Default is 1%.</span> |
| <span class="sd"> The support must be greater than 1e-4.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> DataFrame with frequent items.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This function is meant for exploratory data analysis, as we make no</span> |
| <span class="sd"> guarantee about the backward compatibility of the schema of the resulting</span> |
| <span class="sd"> :class:`DataFrame`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, 11), (1, 11), (3, 10), (4, 8), (4, 8)], ["c1", "c2"])</span> |
| <span class="sd"> >>> df.freqItems(["c1", "c2"]).show() # doctest: +SKIP</span> |
| <span class="sd"> +------------+------------+</span> |
| <span class="sd"> |c1_freqItems|c2_freqItems|</span> |
| <span class="sd"> +------------+------------+</span> |
| <span class="sd"> | [4, 1, 3]| [8, 11, 10]|</span> |
| <span class="sd"> +------------+------------+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">_ipython_key_completions_</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""Returns the names of columns in this :class:`DataFrame`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df._ipython_key_completions_()</span> |
| <span class="sd"> ['age', 'name']</span> |
| |
| <span class="sd"> Would return illegal identifiers.</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], ["age 1", "name?1"])</span> |
| <span class="sd"> >>> df._ipython_key_completions_()</span> |
| <span class="sd"> ['age 1', 'name?1']</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.withColumns"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumns.html#pyspark.sql.DataFrame.withColumns">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">withColumns</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">colsMap</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Column</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a new :class:`DataFrame` by adding multiple columns or replacing the</span> |
| <span class="sd"> existing columns that have the same names.</span> |
| |
| <span class="sd"> The colsMap is a map of column name and column, the column must only refer to attributes</span> |
| <span class="sd"> supplied by this Dataset. It is an error to add columns that refer to some other Dataset.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| <span class="sd"> Added support for multiple columns adding</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> colsMap : dict</span> |
| <span class="sd"> a dict of column name and :class:`Column`. Currently, only a single map is supported.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> DataFrame with new or replaced columns.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df.withColumns({'age2': df.age + 2, 'age3': df.age + 3}).show()</span> |
| <span class="sd"> +---+-----+----+----+</span> |
| <span class="sd"> |age| name|age2|age3|</span> |
| <span class="sd"> +---+-----+----+----+</span> |
| <span class="sd"> | 2|Alice| 4| 5|</span> |
| <span class="sd"> | 5| Bob| 7| 8|</span> |
| <span class="sd"> +---+-----+----+----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.withColumn"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumn.html#pyspark.sql.DataFrame.withColumn">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">withColumn</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">colName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a new :class:`DataFrame` by adding a column or replacing the</span> |
| <span class="sd"> existing column that has the same name.</span> |
| |
| <span class="sd"> The column expression must be an expression over this :class:`DataFrame`; attempting to add</span> |
| <span class="sd"> a column from some other :class:`DataFrame` will raise an error.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> colName : str</span> |
| <span class="sd"> string, name of the new column.</span> |
| <span class="sd"> col : :class:`Column`</span> |
| <span class="sd"> a :class:`Column` expression for the new column.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> DataFrame with new or replaced column.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This method introduces a projection internally. Therefore, calling it multiple</span> |
| <span class="sd"> times, for instance, via loops in order to add multiple columns can generate big</span> |
| <span class="sd"> plans which can cause performance issues and even `StackOverflowException`.</span> |
| <span class="sd"> To avoid this, use :func:`select` with multiple columns at once.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df.withColumn('age2', df.age + 2).show()</span> |
| <span class="sd"> +---+-----+----+</span> |
| <span class="sd"> |age| name|age2|</span> |
| <span class="sd"> +---+-----+----+</span> |
| <span class="sd"> | 2|Alice| 4|</span> |
| <span class="sd"> | 5| Bob| 7|</span> |
| <span class="sd"> +---+-----+----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.withColumnRenamed"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumnRenamed.html#pyspark.sql.DataFrame.withColumnRenamed">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">withColumnRenamed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">existing</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">new</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a new :class:`DataFrame` by renaming an existing column.</span> |
| <span class="sd"> This is a no-op if the schema doesn't contain the given column name.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> existing : str</span> |
| <span class="sd"> The name of the existing column to be renamed.</span> |
| <span class="sd"> new : str</span> |
| <span class="sd"> The new name to be assigned to the column.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> A new DataFrame with renamed column.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> :meth:`withColumnsRenamed`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| |
| <span class="sd"> Example 1: Rename a single column</span> |
| |
| <span class="sd"> >>> df.withColumnRenamed("age", "age2").show()</span> |
| <span class="sd"> +----+-----+</span> |
| <span class="sd"> |age2| name|</span> |
| <span class="sd"> +----+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +----+-----+</span> |
| |
| <span class="sd"> Example 2: Rename a column that does not exist (no-op)</span> |
| |
| <span class="sd"> >>> df.withColumnRenamed("non_existing", "new_name").show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> Example 3: Rename multiple columns</span> |
| |
| <span class="sd"> >>> df.withColumnRenamed("age", "age2").withColumnRenamed("name", "name2").show()</span> |
| <span class="sd"> +----+-----+</span> |
| <span class="sd"> |age2|name2|</span> |
| <span class="sd"> +----+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +----+-----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.withColumnsRenamed"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.withColumnsRenamed.html#pyspark.sql.DataFrame.withColumnsRenamed">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">withColumnsRenamed</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">colsMap</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="nb">str</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a new :class:`DataFrame` by renaming multiple columns.</span> |
| <span class="sd"> This is a no-op if the schema doesn't contain the given column names.</span> |
| |
| <span class="sd"> .. versionadded:: 3.4.0</span> |
| <span class="sd"> Added support for multiple columns renaming</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> colsMap : dict</span> |
| <span class="sd"> A dict of existing column names and corresponding desired column names.</span> |
| <span class="sd"> Currently, only a single map is supported.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> DataFrame with renamed columns.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> :meth:`withColumnRenamed`</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| |
| <span class="sd"> Example 1: Rename a single column</span> |
| |
| <span class="sd"> >>> df.withColumnsRenamed({"age": "age2"}).show()</span> |
| <span class="sd"> +----+-----+</span> |
| <span class="sd"> |age2| name|</span> |
| <span class="sd"> +----+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +----+-----+</span> |
| |
| <span class="sd"> Example 2: Rename multiple columns</span> |
| |
| <span class="sd"> >>> df.withColumnsRenamed({"age": "age2", "name": "name2"}).show()</span> |
| <span class="sd"> +----+-----+</span> |
| <span class="sd"> |age2|name2|</span> |
| <span class="sd"> +----+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +----+-----+</span> |
| |
| <span class="sd"> Example 3: Rename non-existing column (no-op)</span> |
| |
| <span class="sd"> >>> df.withColumnsRenamed({"non_existing": "new_name"}).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> Example 4: Rename with an empty dictionary (no-op)</span> |
| |
| <span class="sd"> >>> df.withColumnsRenamed({}).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 2|Alice|</span> |
| <span class="sd"> | 5| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.withMetadata"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.withMetadata.html#pyspark.sql.DataFrame.withMetadata">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">withMetadata</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">columnName</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">metadata</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Any</span><span class="p">])</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`DataFrame` by updating an existing column with metadata.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> columnName : str</span> |
| <span class="sd"> string, name of the existing column to update the metadata.</span> |
| <span class="sd"> metadata : dict</span> |
| <span class="sd"> dict, new metadata to be assigned to df.schema[columnName].metadata</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> DataFrame with updated metadata column.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])</span> |
| <span class="sd"> >>> df_meta = df.withMetadata('age', {'foo': 'bar'})</span> |
| <span class="sd"> >>> df_meta.schema['age'].metadata</span> |
| <span class="sd"> {'foo': 'bar'}</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">drop</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">drop</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.drop"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.drop.html#pyspark.sql.DataFrame.drop">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="k">def</span> <span class="nf">drop</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a new :class:`DataFrame` without specified columns.</span> |
| <span class="sd"> This is a no-op if the schema doesn't contain the given column name(s).</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> cols: str or :class:`Column`</span> |
| <span class="sd"> A name of the column, or the :class:`Column` to be dropped.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> A new :class:`DataFrame` without the specified columns.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> - When an input is a column name, it is treated literally without further interpretation.</span> |
| <span class="sd"> Otherwise, it will try to match the equivalent expression.</span> |
| <span class="sd"> So dropping a column by its name `drop(colName)` has a different semantic</span> |
| <span class="sd"> with directly dropping the column `drop(col(colName))`.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> Example 1: Drop a column by name.</span> |
| |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.drop('age').show()</span> |
| <span class="sd"> +-----+</span> |
| <span class="sd"> | name|</span> |
| <span class="sd"> +-----+</span> |
| <span class="sd"> | Tom|</span> |
| <span class="sd"> |Alice|</span> |
| <span class="sd"> | Bob|</span> |
| <span class="sd"> +-----+</span> |
| |
| <span class="sd"> Example 2: Drop a column by :class:`Column` object.</span> |
| |
| <span class="sd"> >>> df.drop(df.age).show()</span> |
| <span class="sd"> +-----+</span> |
| <span class="sd"> | name|</span> |
| <span class="sd"> +-----+</span> |
| <span class="sd"> | Tom|</span> |
| <span class="sd"> |Alice|</span> |
| <span class="sd"> | Bob|</span> |
| <span class="sd"> +-----+</span> |
| |
| <span class="sd"> Example 3: Drop the column that joined both DataFrames on.</span> |
| |
| <span class="sd"> >>> df2 = spark.createDataFrame([(80, "Tom"), (85, "Bob")], ["height", "name"])</span> |
| <span class="sd"> >>> df.join(df2, df.name == df2.name).drop('name').sort('age').show()</span> |
| <span class="sd"> +---+------+</span> |
| <span class="sd"> |age|height|</span> |
| <span class="sd"> +---+------+</span> |
| <span class="sd"> | 14| 80|</span> |
| <span class="sd"> | 16| 85|</span> |
| <span class="sd"> +---+------+</span> |
| |
| <span class="sd"> >>> df3 = df.join(df2)</span> |
| <span class="sd"> >>> df3.show()</span> |
| <span class="sd"> +---+-----+------+----+</span> |
| <span class="sd"> |age| name|height|name|</span> |
| <span class="sd"> +---+-----+------+----+</span> |
| <span class="sd"> | 14| Tom| 80| Tom|</span> |
| <span class="sd"> | 14| Tom| 85| Bob|</span> |
| <span class="sd"> | 23|Alice| 80| Tom|</span> |
| <span class="sd"> | 23|Alice| 85| Bob|</span> |
| <span class="sd"> | 16| Bob| 80| Tom|</span> |
| <span class="sd"> | 16| Bob| 85| Bob|</span> |
| <span class="sd"> +---+-----+------+----+</span> |
| |
| <span class="sd"> Example 4: Drop two column by the same name.</span> |
| |
| <span class="sd"> >>> df3.drop("name").show()</span> |
| <span class="sd"> +---+------+</span> |
| <span class="sd"> |age|height|</span> |
| <span class="sd"> +---+------+</span> |
| <span class="sd"> | 14| 80|</span> |
| <span class="sd"> | 14| 85|</span> |
| <span class="sd"> | 23| 80|</span> |
| <span class="sd"> | 23| 85|</span> |
| <span class="sd"> | 16| 80|</span> |
| <span class="sd"> | 16| 85|</span> |
| <span class="sd"> +---+------+</span> |
| |
| <span class="sd"> Example 5: Can not drop col('name') due to ambiguous reference.</span> |
| |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> df3.drop(sf.col("name")).show()</span> |
| <span class="sd"> Traceback (most recent call last):</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> pyspark.errors.exceptions.captured.AnalysisException: [AMBIGUOUS_REFERENCE] Reference...</span> |
| |
| <span class="sd"> Example 6: Can not find a column matching the expression "a.b.c".</span> |
| |
| <span class="sd"> >>> from pyspark.sql import functions as sf</span> |
| <span class="sd"> >>> df4 = df.withColumn("a.b.c", sf.lit(1))</span> |
| <span class="sd"> >>> df4.show()</span> |
| <span class="sd"> +---+-----+-----+</span> |
| <span class="sd"> |age| name|a.b.c|</span> |
| <span class="sd"> +---+-----+-----+</span> |
| <span class="sd"> | 14| Tom| 1|</span> |
| <span class="sd"> | 23|Alice| 1|</span> |
| <span class="sd"> | 16| Bob| 1|</span> |
| <span class="sd"> +---+-----+-----+</span> |
| |
| <span class="sd"> >>> df4.drop("a.b.c").show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |age| name|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 14| Tom|</span> |
| <span class="sd"> | 23|Alice|</span> |
| <span class="sd"> | 16| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| |
| <span class="sd"> >>> df4.drop(sf.col("a.b.c")).show()</span> |
| <span class="sd"> +---+-----+-----+</span> |
| <span class="sd"> |age| name|a.b.c|</span> |
| <span class="sd"> +---+-----+-----+</span> |
| <span class="sd"> | 14| Tom| 1|</span> |
| <span class="sd"> | 23|Alice| 1|</span> |
| <span class="sd"> | 16| Bob| 1|</span> |
| <span class="sd"> +---+-----+-----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.toDF"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.toDF.html#pyspark.sql.DataFrame.toDF">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">toDF</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`DataFrame` that with new specified column names</span> |
| |
| <span class="sd"> .. versionadded:: 1.6.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> *cols : tuple</span> |
| <span class="sd"> a tuple of string new column name. The length of the</span> |
| <span class="sd"> list needs to be the same as the number of columns in the initial</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> DataFrame with new column names.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(14, "Tom"), (23, "Alice"),</span> |
| <span class="sd"> ... (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.toDF('f1', 'f2').show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | f1| f2|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 14| Tom|</span> |
| <span class="sd"> | 23|Alice|</span> |
| <span class="sd"> | 16| Bob|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.transform"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.transform.html#pyspark.sql.DataFrame.transform">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">transform</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">func</span><span class="p">:</span> <span class="n">Callable</span><span class="p">[</span><span class="o">...</span><span class="p">,</span> <span class="s2">"DataFrame"</span><span class="p">],</span> <span class="o">*</span><span class="n">args</span><span class="p">:</span> <span class="n">Any</span><span class="p">,</span> <span class="o">**</span><span class="n">kwargs</span><span class="p">:</span> <span class="n">Any</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Returns a new :class:`DataFrame`. Concise syntax for chaining custom transformations.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : function</span> |
| <span class="sd"> a function that takes and returns a :class:`DataFrame`.</span> |
| <span class="sd"> *args</span> |
| <span class="sd"> Positional arguments to pass to func.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| <span class="sd"> **kwargs</span> |
| <span class="sd"> Keyword arguments to pass to func.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrame`</span> |
| <span class="sd"> Transformed DataFrame.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> from pyspark.sql.functions import col</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])</span> |
| <span class="sd"> >>> def cast_all_to_int(input_df):</span> |
| <span class="sd"> ... return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> def sort_columns_asc(input_df):</span> |
| <span class="sd"> ... return input_df.select(*sorted(input_df.columns))</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df.transform(cast_all_to_int).transform(sort_columns_asc).show()</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> |float|int|</span> |
| <span class="sd"> +-----+---+</span> |
| <span class="sd"> | 1| 1|</span> |
| <span class="sd"> | 2| 2|</span> |
| <span class="sd"> +-----+---+</span> |
| |
| <span class="sd"> >>> def add_n(input_df, n):</span> |
| <span class="sd"> ... return input_df.select([(col(col_name) + n).alias(col_name)</span> |
| <span class="sd"> ... for col_name in input_df.columns])</span> |
| <span class="sd"> >>> df.transform(add_n, 1).transform(add_n, n=10).show()</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> |int|float|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> | 12| 12.0|</span> |
| <span class="sd"> | 13| 13.0|</span> |
| <span class="sd"> +---+-----+</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.sameSemantics"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.sameSemantics.html#pyspark.sql.DataFrame.sameSemantics">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">sameSemantics</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">other</span><span class="p">:</span> <span class="s2">"DataFrame"</span><span class="p">)</span> <span class="o">-></span> <span class="nb">bool</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns `True` when the logical query plans inside both :class:`DataFrame`\\s are equal and</span> |
| <span class="sd"> therefore return the same results.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.5.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> The equality comparison here is simplified by tolerating the cosmetic differences</span> |
| <span class="sd"> such as attribute names.</span> |
| |
| <span class="sd"> This API can compare both :class:`DataFrame`\\s very fast but can still return</span> |
| <span class="sd"> `False` on the :class:`DataFrame` that return the same results, for instance, from</span> |
| <span class="sd"> different plans. Such false negative semantic can be useful when caching as an example.</span> |
| |
| <span class="sd"> This API is a developer API.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> other : :class:`DataFrame`</span> |
| <span class="sd"> The other DataFrame to compare against.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> bool</span> |
| <span class="sd"> Whether these two DataFrames are similar.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df1 = spark.range(10)</span> |
| <span class="sd"> >>> df2 = spark.range(10)</span> |
| <span class="sd"> >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col1", df2.id * 2))</span> |
| <span class="sd"> True</span> |
| <span class="sd"> >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col1", df2.id + 2))</span> |
| <span class="sd"> False</span> |
| <span class="sd"> >>> df1.withColumn("col1", df1.id * 2).sameSemantics(df2.withColumn("col0", df2.id * 2))</span> |
| <span class="sd"> True</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.semanticHash"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.semanticHash.html#pyspark.sql.DataFrame.semanticHash">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">semanticHash</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="nb">int</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a hash code of the logical query plan against this :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.5.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> Unlike the standard hash code, the hash is calculated against the query plan</span> |
| <span class="sd"> simplified by tolerating the cosmetic differences such as attribute names.</span> |
| |
| <span class="sd"> This API is a developer API.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> int</span> |
| <span class="sd"> Hash value.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> spark.range(10).selectExpr("id as col0").semanticHash() # doctest: +SKIP</span> |
| <span class="sd"> 1855039936</span> |
| <span class="sd"> >>> spark.range(10).selectExpr("id as col1").semanticHash() # doctest: +SKIP</span> |
| <span class="sd"> 1855039936</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.inputFiles"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.inputFiles.html#pyspark.sql.DataFrame.inputFiles">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">inputFiles</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns a best-effort snapshot of the files that compose this :class:`DataFrame`.</span> |
| <span class="sd"> This method simply asks each constituent BaseRelation for its respective files and</span> |
| <span class="sd"> takes the union of all results. Depending on the source relations, this may not find</span> |
| <span class="sd"> all input files. Duplicates are removed.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> list</span> |
| <span class="sd"> List of file paths.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import tempfile</span> |
| <span class="sd"> >>> with tempfile.TemporaryDirectory(prefix="inputFiles") as d:</span> |
| <span class="sd"> ... # Write a single-row DataFrame into a JSON file</span> |
| <span class="sd"> ... spark.createDataFrame(</span> |
| <span class="sd"> ... [{"age": 100, "name": "Hyukjin Kwon"}]</span> |
| <span class="sd"> ... ).repartition(1).write.json(d, mode="overwrite")</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ... # Read the JSON file as a DataFrame.</span> |
| <span class="sd"> ... df = spark.read.format("json").load(d)</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> ... # Returns the number of input files.</span> |
| <span class="sd"> ... len(df.inputFiles())</span> |
| <span class="sd"> 1</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.where"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.where.html#pyspark.sql.DataFrame.where">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">where</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">condition</span><span class="p">:</span> <span class="s2">"ColumnOrName"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> :func:`where` is an alias for :func:`filter`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <span class="c1"># Two aliases below were added for pandas compatibility many years ago.</span> |
| <span class="c1"># There are too many differences compared to pandas and we cannot just</span> |
| <span class="c1"># make it "compatible" by adding aliases. Therefore, we stop adding such</span> |
| <span class="c1"># aliases as of Spark 3.0. Two methods below remain just</span> |
| <span class="c1"># for legacy users currently.</span> |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">groupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrNameOrOrdinal"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GroupedData"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">groupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">__cols</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="n">Column</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">]])</span> <span class="o">-></span> <span class="s2">"GroupedData"</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="k">def</span> <span class="nf">groupby</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">*</span><span class="n">cols</span><span class="p">:</span> <span class="s2">"ColumnOrNameOrOrdinal"</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"GroupedData"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> :func:`groupby` is an alias for :func:`groupBy`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrame.drop_duplicates"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.drop_duplicates.html#pyspark.sql.DataFrame.drop_duplicates">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">drop_duplicates</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> :func:`drop_duplicates` is an alias for :func:`dropDuplicates`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.writeTo"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.writeTo.html#pyspark.sql.DataFrame.writeTo">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">writeTo</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">table</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrameWriterV2</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Create a write configuration builder for v2 sources.</span> |
| |
| <span class="sd"> This builder is used to configure and execute write operations.</span> |
| |
| <span class="sd"> For example, to append or create or replace existing tables.</span> |
| |
| <span class="sd"> .. versionadded:: 3.1.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> table : str</span> |
| <span class="sd"> Target table name to write to.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`DataFrameWriterV2`</span> |
| <span class="sd"> DataFrameWriterV2 to use further to specify how to save the data</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| <span class="sd"> >>> df.writeTo("catalog.db.table").append() # doctest: +SKIP</span> |
| <span class="sd"> >>> df.writeTo( # doctest: +SKIP</span> |
| <span class="sd"> ... "catalog.db.table"</span> |
| <span class="sd"> ... ).partitionedBy("col").createOrReplace()</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.pandas_api"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.pandas_api.html#pyspark.sql.DataFrame.pandas_api">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">pandas_api</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">index_col</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"PandasOnSparkDataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Converts the existing DataFrame into a pandas-on-Spark DataFrame.</span> |
| |
| <span class="sd"> .. versionadded:: 3.2.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.5.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> If a pandas-on-Spark DataFrame is converted to a Spark DataFrame and then back</span> |
| <span class="sd"> to pandas-on-Spark, it will lose the index information and the original index</span> |
| <span class="sd"> will be turned into a normal column.</span> |
| |
| <span class="sd"> This is only available if Pandas is installed and available.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> index_col: str or list of str, optional</span> |
| <span class="sd"> Index column of table in Spark.</span> |
| |
| <span class="sd"> Returns</span> |
| <span class="sd"> -------</span> |
| <span class="sd"> :class:`PandasOnSparkDataFrame`</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.pandas.frame.DataFrame.to_spark</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame(</span> |
| <span class="sd"> ... [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])</span> |
| |
| <span class="sd"> >>> df.pandas_api() # doctest: +SKIP</span> |
| <span class="sd"> age name</span> |
| <span class="sd"> 0 14 Tom</span> |
| <span class="sd"> 1 23 Alice</span> |
| <span class="sd"> 2 16 Bob</span> |
| |
| <span class="sd"> We can specify the index columns.</span> |
| |
| <span class="sd"> >>> df.pandas_api(index_col="age") # doctest: +SKIP</span> |
| <span class="sd"> name</span> |
| <span class="sd"> age</span> |
| <span class="sd"> 14 Tom</span> |
| <span class="sd"> 23 Alice</span> |
| <span class="sd"> 16 Bob</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.mapInPandas"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.mapInPandas.html#pyspark.sql.DataFrame.mapInPandas">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">mapInPandas</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">func</span><span class="p">:</span> <span class="s2">"PandasMapIterFunction"</span><span class="p">,</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="n">barrier</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">profile</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">ResourceProfile</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Maps an iterator of batches in the current :class:`DataFrame` using a Python native</span> |
| <span class="sd"> function that is performed on pandas DataFrames both as input and output,</span> |
| <span class="sd"> and returns the result as a :class:`DataFrame`.</span> |
| |
| <span class="sd"> This method applies the specified Python function to an iterator of</span> |
| <span class="sd"> `pandas.DataFrame`\\s, each representing a batch of rows from the original DataFrame.</span> |
| <span class="sd"> The returned iterator of `pandas.DataFrame`\\s are combined as a :class:`DataFrame`.</span> |
| <span class="sd"> The size of the function's input and output can be different. Each `pandas.DataFrame`</span> |
| <span class="sd"> size can be controlled by `spark.sql.execution.arrow.maxRecordsPerBatch`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.0.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : function</span> |
| <span class="sd"> a Python native function that takes an iterator of `pandas.DataFrame`\\s, and</span> |
| <span class="sd"> outputs an iterator of `pandas.DataFrame`\\s.</span> |
| <span class="sd"> schema : :class:`pyspark.sql.types.DataType` or str</span> |
| <span class="sd"> the return type of the `func` in PySpark. The value can be either a</span> |
| <span class="sd"> :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.</span> |
| <span class="sd"> barrier : bool, optional, default False</span> |
| <span class="sd"> Use barrier mode execution, ensuring that all Python workers in the stage will be</span> |
| <span class="sd"> launched concurrently.</span> |
| |
| <span class="sd"> .. versionadded: 3.5.0</span> |
| |
| <span class="sd"> profile : :class:`pyspark.resource.ResourceProfile`. The optional ResourceProfile</span> |
| <span class="sd"> to be used for mapInPandas.</span> |
| |
| <span class="sd"> .. versionadded: 4.0.0</span> |
| |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))</span> |
| |
| <span class="sd"> Filter rows with id equal to 1:</span> |
| |
| <span class="sd"> >>> def filter_func(iterator):</span> |
| <span class="sd"> ... for pdf in iterator:</span> |
| <span class="sd"> ... yield pdf[pdf.id == 1]</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df.mapInPandas(filter_func, df.schema).show() # doctest: +SKIP</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | id|age|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 21|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> Compute the mean age for each id:</span> |
| |
| <span class="sd"> >>> def mean_age(iterator):</span> |
| <span class="sd"> ... for pdf in iterator:</span> |
| <span class="sd"> ... yield pdf.groupby("id").mean().reset_index()</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df.mapInPandas(mean_age, "id: bigint, age: double").show() # doctest: +SKIP</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> | id| age|</span> |
| <span class="sd"> +---+----+</span> |
| <span class="sd"> | 1|21.0|</span> |
| <span class="sd"> | 2|30.0|</span> |
| <span class="sd"> +---+----+</span> |
| |
| <span class="sd"> Add a new column with the double of the age:</span> |
| |
| <span class="sd"> >>> def double_age(iterator):</span> |
| <span class="sd"> ... for pdf in iterator:</span> |
| <span class="sd"> ... pdf["double_age"] = pdf["age"] * 2</span> |
| <span class="sd"> ... yield pdf</span> |
| <span class="sd"> ...</span> |
| <span class="sd"> >>> df.mapInPandas(</span> |
| <span class="sd"> ... double_age, "id: bigint, age: bigint, double_age: bigint").show() # doctest: +SKIP</span> |
| <span class="sd"> +---+---+----------+</span> |
| <span class="sd"> | id|age|double_age|</span> |
| <span class="sd"> +---+---+----------+</span> |
| <span class="sd"> | 1| 21| 42|</span> |
| <span class="sd"> | 2| 30| 60|</span> |
| <span class="sd"> +---+---+----------+</span> |
| |
| <span class="sd"> Set ``barrier`` to ``True`` to force the ``mapInPandas`` stage running in the</span> |
| <span class="sd"> barrier mode, it ensures all Python workers in the stage will be</span> |
| <span class="sd"> launched concurrently.</span> |
| |
| <span class="sd"> >>> df.mapInPandas(filter_func, df.schema, barrier=True).show() # doctest: +SKIP</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | id|age|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 21|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is experimental</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.sql.functions.pandas_udf</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.mapInArrow"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.mapInArrow.html#pyspark.sql.DataFrame.mapInArrow">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">mapInArrow</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">func</span><span class="p">:</span> <span class="s2">"ArrowMapIterFunction"</span><span class="p">,</span> |
| <span class="n">schema</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">StructType</span><span class="p">,</span> <span class="nb">str</span><span class="p">],</span> |
| <span class="n">barrier</span><span class="p">:</span> <span class="nb">bool</span> <span class="o">=</span> <span class="kc">False</span><span class="p">,</span> |
| <span class="n">profile</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">ResourceProfile</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="s2">"DataFrame"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Maps an iterator of batches in the current :class:`DataFrame` using a Python native</span> |
| <span class="sd"> function that is performed on `pyarrow.RecordBatch`\\s both as input and output,</span> |
| <span class="sd"> and returns the result as a :class:`DataFrame`.</span> |
| |
| <span class="sd"> This method applies the specified Python function to an iterator of</span> |
| <span class="sd"> `pyarrow.RecordBatch`\\s, each representing a batch of rows from the original DataFrame.</span> |
| <span class="sd"> The returned iterator of `pyarrow.RecordBatch`\\s are combined as a :class:`DataFrame`.</span> |
| <span class="sd"> The size of the function's input and output can be different. Each `pyarrow.RecordBatch`</span> |
| <span class="sd"> size can be controlled by `spark.sql.execution.arrow.maxRecordsPerBatch`.</span> |
| |
| <span class="sd"> .. versionadded:: 3.3.0</span> |
| |
| <span class="sd"> Parameters</span> |
| <span class="sd"> ----------</span> |
| <span class="sd"> func : function</span> |
| <span class="sd"> a Python native function that takes an iterator of `pyarrow.RecordBatch`\\s, and</span> |
| <span class="sd"> outputs an iterator of `pyarrow.RecordBatch`\\s.</span> |
| <span class="sd"> schema : :class:`pyspark.sql.types.DataType` or str</span> |
| <span class="sd"> the return type of the `func` in PySpark. The value can be either a</span> |
| <span class="sd"> :class:`pyspark.sql.types.DataType` object or a DDL-formatted type string.</span> |
| <span class="sd"> barrier : bool, optional, default False</span> |
| <span class="sd"> Use barrier mode execution, ensuring that all Python workers in the stage will be</span> |
| <span class="sd"> launched concurrently.</span> |
| |
| <span class="sd"> .. versionadded: 3.5.0</span> |
| |
| <span class="sd"> profile : :class:`pyspark.resource.ResourceProfile`. The optional ResourceProfile</span> |
| <span class="sd"> to be used for mapInArrow.</span> |
| |
| <span class="sd"> .. versionadded: 4.0.0</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> import pyarrow # doctest: +SKIP</span> |
| <span class="sd"> >>> df = spark.createDataFrame([(1, 21), (2, 30)], ("id", "age"))</span> |
| <span class="sd"> >>> def filter_func(iterator):</span> |
| <span class="sd"> ... for batch in iterator:</span> |
| <span class="sd"> ... pdf = batch.to_pandas()</span> |
| <span class="sd"> ... yield pyarrow.RecordBatch.from_pandas(pdf[pdf.id == 1])</span> |
| <span class="sd"> >>> df.mapInArrow(filter_func, df.schema).show() # doctest: +SKIP</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | id|age|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 21|</span> |
| <span class="sd"> +---+---+</span> |
| |
| <span class="sd"> Set ``barrier`` to ``True`` to force the ``mapInArrow`` stage running in the</span> |
| <span class="sd"> barrier mode, it ensures all Python workers in the stage will be</span> |
| <span class="sd"> launched concurrently.</span> |
| |
| <span class="sd"> >>> df.mapInArrow(filter_func, df.schema, barrier=True).show() # doctest: +SKIP</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | id|age|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 21|</span> |
| <span class="sd"> +---+---+</span> |
| |
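| <span class="sd"> As an additional, illustrative sketch (assuming the cluster manager supports</span> |
| <span class="sd"> stage-level scheduling), a custom :class:`pyspark.resource.ResourceProfile`</span> |
| <span class="sd"> can be passed via ``profile`` to request task resources for this stage:</span> |
| |
| <span class="sd"> >>> from pyspark.resource import ResourceProfileBuilder, TaskResourceRequests</span> |
| <span class="sd"> >>> treqs = TaskResourceRequests().cpus(2) # ask for 2 CPUs per task</span> |
| <span class="sd"> >>> rp = ResourceProfileBuilder().require(treqs).build</span> |
| <span class="sd"> >>> df.mapInArrow(filter_func, df.schema, profile=rp).show() # doctest: +SKIP</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | id|age|</span> |
| <span class="sd"> +---+---+</span> |
| <span class="sd"> | 1| 21|</span> |
| <span class="sd"> +---+---+</span> |
| |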
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This API is unstable, and for developers.</span> |
| |
| <span class="sd"> See Also</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> pyspark.sql.functions.pandas_udf</span> |
| <span class="sd"> pyspark.sql.DataFrame.mapInPandas</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.toArrow"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.toArrow.html#pyspark.sql.DataFrame.toArrow">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">toArrow</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"pa.Table"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the contents of this :class:`DataFrame` as PyArrow ``pyarrow.Table``.</span> |
| |
| <span class="sd"> This is only available if PyArrow is installed and available.</span> |
| |
| <span class="sd"> .. versionadded:: 4.0.0</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This method should only be used if the resulting PyArrow ``pyarrow.Table`` is</span> |
| <span class="sd"> expected to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> This API is a developer API.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.toArrow() # doctest: +SKIP</span> |
| <span class="sd"> pyarrow.Table</span> |
| <span class="sd"> age: int64</span> |
| <span class="sd"> name: string</span> |
| <span class="sd"> ----</span> |
| <span class="sd"> age: [[2,5]]</span> |
| <span class="sd"> name: [["Alice","Bob"]]</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div> |
| |
| <div class="viewcode-block" id="DataFrame.toPandas"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrame.toPandas.html#pyspark.sql.DataFrame.toPandas">[docs]</a> <span class="k">def</span> <span class="nf">toPandas</span><span class="p">(</span><span class="bp">self</span><span class="p">)</span> <span class="o">-></span> <span class="s2">"PandasDataFrameLike"</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""</span> |
| <span class="sd"> Returns the contents of this :class:`DataFrame` as Pandas ``pandas.DataFrame``.</span> |
| |
| <span class="sd"> This is only available if Pandas is installed and available.</span> |
| |
| <span class="sd"> .. versionadded:: 1.3.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| |
| <span class="sd"> Notes</span> |
| <span class="sd"> -----</span> |
| <span class="sd"> This method should only be used if the resulting Pandas ``pandas.DataFrame`` is</span> |
| <span class="sd"> expected to be small, as all the data is loaded into the driver's memory.</span> |
| |
| <span class="sd"> Usage with ``spark.sql.execution.arrow.pyspark.enabled=True`` is experimental.</span> |
| |
| <span class="sd"> Examples</span> |
| <span class="sd"> --------</span> |
| <span class="sd"> >>> df.toPandas() # doctest: +SKIP</span> |
| <span class="sd"> age name</span> |
| <span class="sd"> 0 2 Alice</span> |
| <span class="sd"> 1 5 Bob</span> |
| <span class="sd"> """</span> |
| <span class="o">...</span></div></div> |
| |
| |
| <div class="viewcode-block" id="DataFrameNaFunctions"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameNaFunctions.html#pyspark.sql.DataFrameNaFunctions">[docs]</a><span class="k">class</span> <span class="nc">DataFrameNaFunctions</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Functionality for working with missing data in :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">df</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">df</span> <span class="o">=</span> <span class="n">df</span> |
| |
| <div class="viewcode-block" id="DataFrameNaFunctions.drop"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameNaFunctions.drop.html#pyspark.sql.DataFrameNaFunctions.drop">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">drop</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">how</span><span class="p">:</span> <span class="nb">str</span> <span class="o">=</span> <span class="s2">"any"</span><span class="p">,</span> |
| <span class="n">thresh</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="o">...</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span></div> |
| |
| <span class="n">drop</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">dropna</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">fill</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="s2">"LiteralType"</span><span class="p">,</span> <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">fill</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">value</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="s2">"LiteralType"</span><span class="p">])</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrameNaFunctions.fill"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameNaFunctions.fill.html#pyspark.sql.DataFrameNaFunctions.fill">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">fill</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="s2">"LiteralType"</span><span class="p">,</span> <span class="n">Dict</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="s2">"LiteralType"</span><span class="p">]],</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span></div> |
| |
| <span class="n">fill</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">fillna</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">replace</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">to_replace</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">"LiteralType"</span><span class="p">],</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">"OptionalPrimitiveType"</span><span class="p">],</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">replace</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">to_replace</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="s2">"LiteralType"</span><span class="p">,</span> <span class="s2">"OptionalPrimitiveType"</span><span class="p">],</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">replace</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">to_replace</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="s2">"LiteralType"</span><span class="p">],</span> |
| <span class="n">value</span><span class="p">:</span> <span class="s2">"OptionalPrimitiveType"</span><span class="p">,</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="o">...</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrameNaFunctions.replace"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameNaFunctions.replace.html#pyspark.sql.DataFrameNaFunctions.replace">[docs]</a> <span class="nd">@dispatch_df_method</span> <span class="c1"># type: ignore[misc]</span> |
| <span class="k">def</span> <span class="nf">replace</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">to_replace</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="s2">"LiteralType"</span><span class="p">],</span> <span class="n">Dict</span><span class="p">[</span><span class="s2">"LiteralType"</span><span class="p">,</span> <span class="s2">"OptionalPrimitiveType"</span><span class="p">]],</span> |
| <span class="n">value</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span> |
| <span class="n">Union</span><span class="p">[</span><span class="s2">"OptionalPrimitiveType"</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="s2">"OptionalPrimitiveType"</span><span class="p">],</span> <span class="n">_NoValueType</span><span class="p">]</span> |
| <span class="p">]</span> <span class="o">=</span> <span class="n">_NoValue</span><span class="p">,</span> |
| <span class="n">subset</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">]]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span></div> |
| |
| <span class="n">replace</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">replace</span><span class="o">.</span><span class="vm">__doc__</span></div> |
| |
| |
| <div class="viewcode-block" id="DataFrameStatFunctions"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.html#pyspark.sql.DataFrameStatFunctions">[docs]</a><span class="k">class</span> <span class="nc">DataFrameStatFunctions</span><span class="p">:</span> |
| <span class="w"> </span><span class="sd">"""Functionality for statistic functions with :class:`DataFrame`.</span> |
| |
| <span class="sd"> .. versionadded:: 1.4.0</span> |
| |
| <span class="sd"> .. versionchanged:: 3.4.0</span> |
| <span class="sd"> Supports Spark Connect.</span> |
| <span class="sd"> """</span> |
| |
| <span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">df</span><span class="p">:</span> <span class="n">DataFrame</span><span class="p">):</span> |
| <span class="bp">self</span><span class="o">.</span><span class="n">df</span> <span class="o">=</span> <span class="n">df</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">approxQuantile</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">col</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> |
| <span class="n">probabilities</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]:</span> |
| <span class="o">...</span> |
| |
| <span class="nd">@overload</span> |
| <span class="k">def</span> <span class="nf">approxQuantile</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">col</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span> |
| <span class="n">probabilities</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]:</span> |
| <span class="o">...</span> |
| |
| <div class="viewcode-block" id="DataFrameStatFunctions.approxQuantile"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.approxQuantile.html#pyspark.sql.DataFrameStatFunctions.approxQuantile">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">approxQuantile</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> |
| <span class="n">col</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="nb">str</span><span class="p">,</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">str</span><span class="p">]],</span> |
| <span class="n">probabilities</span><span class="p">:</span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">Tuple</span><span class="p">[</span><span class="nb">float</span><span class="p">]],</span> |
| <span class="n">relativeError</span><span class="p">:</span> <span class="nb">float</span><span class="p">,</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">Union</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">],</span> <span class="n">List</span><span class="p">[</span><span class="n">List</span><span class="p">[</span><span class="nb">float</span><span class="p">]]]:</span> |
| <span class="o">...</span></div> |
| |
| <span class="n">approxQuantile</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">approxQuantile</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrameStatFunctions.corr"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.corr.html#pyspark.sql.DataFrameStatFunctions.corr">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">corr</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">method</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">str</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="o">...</span></div> |
| |
| <span class="n">corr</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">corr</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrameStatFunctions.cov"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.cov.html#pyspark.sql.DataFrameStatFunctions.cov">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">cov</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="o">...</span></div> |
| |
| <span class="n">cov</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">cov</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrameStatFunctions.crosstab"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.crosstab.html#pyspark.sql.DataFrameStatFunctions.crosstab">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">crosstab</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">col1</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">col2</span><span class="p">:</span> <span class="nb">str</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span></div> |
| |
| <span class="n">crosstab</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">crosstab</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrameStatFunctions.freqItems"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.freqItems.html#pyspark.sql.DataFrameStatFunctions.freqItems">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">freqItems</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="n">cols</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">str</span><span class="p">],</span> <span class="n">support</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">float</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span><span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span></div> |
| |
| <span class="n">freqItems</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">freqItems</span><span class="o">.</span><span class="vm">__doc__</span> |
| |
| <div class="viewcode-block" id="DataFrameStatFunctions.sampleBy"><a class="viewcode-back" href="../../../reference/pyspark.sql/api/pyspark.sql.DataFrameStatFunctions.sampleBy.html#pyspark.sql.DataFrameStatFunctions.sampleBy">[docs]</a> <span class="nd">@dispatch_df_method</span> |
| <span class="k">def</span> <span class="nf">sampleBy</span><span class="p">(</span> |
| <span class="bp">self</span><span class="p">,</span> <span class="n">col</span><span class="p">:</span> <span class="nb">str</span><span class="p">,</span> <span class="n">fractions</span><span class="p">:</span> <span class="n">Dict</span><span class="p">[</span><span class="n">Any</span><span class="p">,</span> <span class="nb">float</span><span class="p">],</span> <span class="n">seed</span><span class="p">:</span> <span class="n">Optional</span><span class="p">[</span><span class="nb">int</span><span class="p">]</span> <span class="o">=</span> <span class="kc">None</span> |
| <span class="p">)</span> <span class="o">-></span> <span class="n">DataFrame</span><span class="p">:</span> |
| <span class="o">...</span></div> |
| |
| <span class="n">sampleBy</span><span class="o">.</span><span class="vm">__doc__</span> <span class="o">=</span> <span class="n">DataFrame</span><span class="o">.</span><span class="n">sampleBy</span><span class="o">.</span><span class="vm">__doc__</span></div> |
| </pre></div> |
| |
| </article> |
| |
| |
| |
| <footer class="bd-footer-article"> |
| |
| <div class="footer-article-items footer-article__inner"> |
| |
| <div class="footer-article-item"><!-- Previous / next buttons --> |
| <div class="prev-next-area"> |
| </div></div> |
| |
| </div> |
| |
| </footer> |
| |
| </div> |
| |
| |
| |
| |
| </div> |
| <footer class="bd-footer-content"> |
| |
| </footer> |
| |
| </main> |
| </div> |
| </div> |
| |
| <!-- Scripts loaded after <body> so the DOM is not blocked --> |
| <script src="../../../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script> |
| <script src="../../../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script> |
| |
| <footer class="bd-footer"> |
| <div class="bd-footer__inner bd-page-width"> |
| |
| <div class="footer-items__start"> |
| |
| <div class="footer-item"><p class="copyright"> |
| Copyright @ 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. |
| </p></div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0. |
| <br/> |
| </p> |
| </div> |
| |
| </div> |
| |
| |
| <div class="footer-items__end"> |
| |
| <div class="footer-item"><p class="theme-version"> |
| Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3. |
| </p></div> |
| |
| </div> |
| |
| </div> |
| |
| </footer> |
| </body> |
| </html> |