| |
| |
| <!DOCTYPE html> |
| |
| |
<html>
| |
| <head> |
| <meta charset="utf-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" /> |
| |
| <title>Upgrading PySpark — PySpark 4.0.0-preview1 documentation</title> |
| |
| |
| |
    <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/migration_guide/pyspark_upgrade.html" />
| |
| |
| |
| </head> |
| |
| |
  <body>
| |
| |
| |
| <a class="skip-link" href="#main-content">Skip to main content</a> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__primary" |
| id="__primary"/> |
| <label class="overlay overlay-primary" for="__primary"></label> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__secondary" |
| id="__secondary"/> |
| <label class="overlay overlay-secondary" for="__secondary"></label> |
| |
| <div class="search-button__wrapper"> |
| <div class="search-button__overlay"></div> |
| <div class="search-button__search-container"> |
| <form class="bd-search d-flex align-items-center" |
| action="../search.html" |
| method="get"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| <input type="search" |
| class="form-control" |
| name="q" |
| id="search-input" |
| placeholder="Search the docs ..." |
| aria-label="Search the docs ..." |
| autocomplete="off" |
| autocorrect="off" |
| autocapitalize="off" |
| spellcheck="false"/> |
| <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span> |
| </form></div> |
| </div> |
| |
| <nav class="bd-header navbar navbar-expand-lg bd-navbar"> |
| <div class="bd-header__inner bd-page-width"> |
| <label class="sidebar-toggle primary-toggle" for="__primary"> |
| <span class="fa-solid fa-bars"></span> |
| </label> |
| |
| <div class="navbar-header-items__start"> |
| |
| <div class="navbar-item"> |
| |
| |
| <a class="navbar-brand logo" href="../index.html"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <img src="../_static/spark-logo-light.png" class="logo__image only-light" alt="Logo image"/> |
| <script>document.write(`<img src="../_static/spark-logo-dark.png" class="logo__image only-dark" alt="Logo image"/>`);</script> |
| |
| |
| </a></div> |
| |
| </div> |
| |
| |
| <div class="col-lg-9 navbar-header-items"> |
| |
| <div class="me-auto navbar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item current active"> |
| <a class="nav-link nav-internal" href="index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| <div class="navbar-header-items__end"> |
| |
| <div class="navbar-item navbar-persistent--container"> |
| |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview1 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Function to construct the target URL from the JSON components |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| template = template.replace("{version}", entry.version); |
| return template; |
| } |
| |
| // Function to check if corresponding page path exists in other version of docs |
| // and, if so, go there instead of the homepage of the other docs version |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "migration_guide/pyspark_upgrade.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| let tryUrl = `${otherDocsHomepage}${currentFilePath}`; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Function to populate the version switcher |
| (function () { |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| node.onclick = checkPageExistsAndRedirect; |
| $("#version_switcher").append(node); |
| }); |
| }); |
| })(); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <label class="sr-only">GitHub</label></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <label class="sr-only">PyPI</label></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="navbar-persistent--mobile"> |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| |
| <label class="sidebar-toggle secondary-toggle" for="__secondary"> |
| <span class="fa-solid fa-outdent"></span> |
| </label> |
| |
| </div> |
| |
| </nav> |
| |
| <div class="bd-container"> |
| <div class="bd-container__inner bd-page-width"> |
| |
| <div class="bd-sidebar-primary bd-sidebar"> |
| |
| |
| |
| <div class="sidebar-header-items sidebar-primary__section"> |
| |
| |
| <div class="sidebar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item current active"> |
| <a class="nav-link nav-internal" href="index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| |
| <div class="sidebar-header-items__end"> |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview1 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Function to construct the target URL from the JSON components |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| template = template.replace("{version}", entry.version); |
| return template; |
| } |
| |
| // Function to check if corresponding page path exists in other version of docs |
| // and, if so, go there instead of the homepage of the other docs version |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "migration_guide/pyspark_upgrade.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| let tryUrl = `${otherDocsHomepage}${currentFilePath}`; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Function to populate the version switcher |
| (function () { |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| node.onclick = checkPageExistsAndRedirect; |
| $("#version_switcher").append(node); |
| }); |
| }); |
| })(); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <label class="sr-only">GitHub</label></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <label class="sr-only">PyPI</label></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| <div class="sidebar-primary-items__start sidebar-primary__section"> |
| <div class="sidebar-primary-item"><nav class="bd-docs-nav bd-links" |
| aria-label="Section Navigation"> |
| <p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p> |
| <div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav"> |
| <li class="toctree-l1 current active"><a class="current reference internal" href="#">Upgrading PySpark</a></li> |
| </ul> |
| <ul class="nav bd-sidenav"> |
| <li class="toctree-l1"><a class="reference internal" href="koalas_to_pyspark.html">Migrating from Koalas to pandas API on Spark</a></li> |
| </ul> |
| </div> |
| </nav></div> |
| </div> |
| |
| |
| <div class="sidebar-primary-items__end sidebar-primary__section"> |
| </div> |
| |
| <div id="rtd-footer-container"></div> |
| |
| |
| </div> |
| |
| <main id="main-content" class="bd-main"> |
| |
| |
| <div class="bd-content"> |
| <div class="bd-article-container"> |
| |
| <div class="bd-header-article"> |
| <div class="header-article-items header-article__inner"> |
| |
| <div class="header-article-items__start"> |
| |
| <div class="header-article-item"> |
| |
| |
| |
| <nav aria-label="Breadcrumbs"> |
| <ul class="bd-breadcrumbs" role="navigation" aria-label="Breadcrumb"> |
| |
| <li class="breadcrumb-item breadcrumb-home"> |
| <a href="../index.html" class="nav-link" aria-label="Home"> |
| <i class="fa-solid fa-home"></i> |
| </a> |
| </li> |
| |
| <li class="breadcrumb-item"><a href="index.html" class="nav-link">Migration Guides</a></li> |
| |
| <li class="breadcrumb-item active" aria-current="page">Upgrading PySpark</li> |
| </ul> |
| </nav> |
| </div> |
| |
| </div> |
| |
| |
| </div> |
| </div> |
| |
| |
| |
| |
| <div id="searchbox"></div> |
| <article class="bd-article" role="main"> |
| |
| <section id="upgrading-pyspark"> |
| <h1>Upgrading PySpark<a class="headerlink" href="#upgrading-pyspark" title="Permalink to this headline">#</a></h1> |
| <section id="upgrading-from-pyspark-3-5-to-4-0"> |
| <h2>Upgrading from PySpark 3.5 to 4.0<a class="headerlink" href="#upgrading-from-pyspark-3-5-to-4-0" title="Permalink to this headline">#</a></h2> |
| <ul class="simple"> |
| <li><p>In Spark 4.0, Python 3.8 support was dropped in PySpark.</p></li> |
| <li><p>In Spark 4.0, the minimum supported version for Pandas has been raised from 1.0.5 to 2.0.0 in PySpark.</p></li> |
<li><p>In Spark 4.0, the minimum supported version for NumPy has been raised from 1.15 to 1.21 in PySpark.</p></li>
| <li><p>In Spark 4.0, the minimum supported version for PyArrow has been raised from 4.0.0 to 10.0.0 in PySpark.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">Int64Index</span></code> and <code class="docutils literal notranslate"><span class="pre">Float64Index</span></code> have been removed from pandas API on Spark, <code class="docutils literal notranslate"><span class="pre">Index</span></code> should be used directly.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">DataFrame.iteritems</span></code> has been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">DataFrame.items</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">Series.iteritems</span></code> has been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">Series.items</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">DataFrame.append</span></code> has been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">ps.concat</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">Series.append</span></code> has been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">ps.concat</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">DataFrame.mad</span></code> has been removed from pandas API on Spark.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">Series.mad</span></code> has been removed from pandas API on Spark.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">na_sentinel</span></code> parameter from <code class="docutils literal notranslate"><span class="pre">Index.factorize</span></code> and <code class="docutils literal notranslate"><span class="pre">Series.factorize</span></code> has been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">use_na_sentinel</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">inplace</span></code> parameter from <code class="docutils literal notranslate"><span class="pre">Categorical.add_categories</span></code>, <code class="docutils literal notranslate"><span class="pre">Categorical.remove_categories</span></code>, <code class="docutils literal notranslate"><span class="pre">Categorical.set_categories</span></code>, <code class="docutils literal notranslate"><span class="pre">Categorical.rename_categories</span></code>, <code class="docutils literal notranslate"><span class="pre">Categorical.reorder_categories</span></code>, <code class="docutils literal notranslate"><span class="pre">Categorical.as_ordered</span></code>, <code class="docutils literal notranslate"><span class="pre">Categorical.as_unordered</span></code> have been removed from pandas API on Spark.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">inplace</span></code> parameter from <code class="docutils literal notranslate"><span class="pre">CategoricalIndex.add_categories</span></code>, <code class="docutils literal notranslate"><span class="pre">CategoricalIndex.remove_categories</span></code>, <code class="docutils literal notranslate"><span class="pre">CategoricalIndex.remove_unused_categories</span></code>, <code class="docutils literal notranslate"><span class="pre">CategoricalIndex.set_categories</span></code>, <code class="docutils literal notranslate"><span class="pre">CategoricalIndex.rename_categories</span></code>, <code class="docutils literal notranslate"><span class="pre">CategoricalIndex.reorder_categories</span></code>, <code class="docutils literal notranslate"><span class="pre">CategoricalIndex.as_ordered</span></code>, <code class="docutils literal notranslate"><span class="pre">CategoricalIndex.as_unordered</span></code> have been removed from pandas API on Spark.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">closed</span></code> parameter from <code class="docutils literal notranslate"><span class="pre">ps.date_range</span></code> has been removed from pandas API on Spark.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">include_start</span></code> and <code class="docutils literal notranslate"><span class="pre">include_end</span></code> parameters from <code class="docutils literal notranslate"><span class="pre">DataFrame.between_time</span></code> have been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">inclusive</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">include_start</span></code> and <code class="docutils literal notranslate"><span class="pre">include_end</span></code> parameters from <code class="docutils literal notranslate"><span class="pre">Series.between_time</span></code> have been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">inclusive</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, the various datetime attributes of <code class="docutils literal notranslate"><span class="pre">DatetimeIndex</span></code> (<code class="docutils literal notranslate"><span class="pre">day</span></code>, <code class="docutils literal notranslate"><span class="pre">month</span></code>, <code class="docutils literal notranslate"><span class="pre">year</span></code> etc.) are now <code class="docutils literal notranslate"><span class="pre">int32</span></code> instead of <code class="docutils literal notranslate"><span class="pre">int64</span></code> from pandas API on Spark.</p></li> |
<li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">sort_columns</span></code> parameter from <code class="docutils literal notranslate"><span class="pre">DataFrame.plot</span></code> and <code class="docutils literal notranslate"><span class="pre">Series.plot</span></code> has been removed from pandas API on Spark.</p></li>
| <li><p>In Spark 4.0, the default value of <code class="docutils literal notranslate"><span class="pre">regex</span></code> parameter for <code class="docutils literal notranslate"><span class="pre">Series.str.replace</span></code> has been changed from <code class="docutils literal notranslate"><span class="pre">True</span></code> to <code class="docutils literal notranslate"><span class="pre">False</span></code> from pandas API on Spark. Additionally, a single character <code class="docutils literal notranslate"><span class="pre">pat</span></code> with <code class="docutils literal notranslate"><span class="pre">regex=True</span></code> is now treated as a regular expression instead of a string literal.</p></li> |
<li><p>In Spark 4.0, the resulting name from <code class="docutils literal notranslate"><span class="pre">value_counts</span></code> for all objects is set to <code class="docutils literal notranslate"><span class="pre">'count'</span></code> (or <code class="docutils literal notranslate"><span class="pre">'proportion'</span></code> if <code class="docutils literal notranslate"><span class="pre">normalize=True</span></code> was passed) in pandas API on Spark, and the index is named after the original object.</p></li>
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">squeeze</span></code> parameter from <code class="docutils literal notranslate"><span class="pre">ps.read_csv</span></code> and <code class="docutils literal notranslate"><span class="pre">ps.read_excel</span></code> has been removed from pandas API on Spark.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">null_counts</span></code> parameter from <code class="docutils literal notranslate"><span class="pre">DataFrame.info</span></code> has been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">show_counts</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, the result of <code class="docutils literal notranslate"><span class="pre">MultiIndex.append</span></code> does not keep the index names from pandas API on Spark.</p></li> |
<li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">DataFrameGroupBy.agg</span></code> with a list of functions respects <code class="docutils literal notranslate"><span class="pre">as_index=False</span></code> in pandas API on Spark.</p></li>
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">DataFrame.stack</span></code> guarantees the order of existing columns instead of sorting them lexicographically from pandas API on Spark.</p></li> |
<li><p>In Spark 4.0, passing <code class="docutils literal notranslate"><span class="pre">True</span></code> or <code class="docutils literal notranslate"><span class="pre">False</span></code> to the <code class="docutils literal notranslate"><span class="pre">inclusive</span></code> parameter of <code class="docutils literal notranslate"><span class="pre">Series.between</span></code> has been removed from pandas API on Spark; use <code class="docutils literal notranslate"><span class="pre">'both'</span></code> or <code class="docutils literal notranslate"><span class="pre">'neither'</span></code> instead, respectively (see the sketch after this list).</p></li>
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">Index.asi8</span></code> has been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">Index.astype</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">Index.is_type_compatible</span></code> has been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">Index.isin</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">col_space</span></code> parameter from <code class="docutils literal notranslate"><span class="pre">DataFrame.to_latex</span></code> and <code class="docutils literal notranslate"><span class="pre">Series.to_latex</span></code> has been removed from pandas API on Spark.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">DataFrame.to_spark_io</span></code> has been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">DataFrame.spark.to_spark_io</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">Series.is_monotonic</span></code> and <code class="docutils literal notranslate"><span class="pre">Index.is_monotonic</span></code> have been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">Series.is_monotonic_increasing</span></code> or <code class="docutils literal notranslate"><span class="pre">Index.is_monotonic_increasing</span></code> instead respectively.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">DataFrame.get_dtype_counts</span></code> has been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">DataFrame.dtypes.value_counts()</span></code> instead.</p></li> |
<li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">encoding</span></code> parameter from <code class="docutils literal notranslate"><span class="pre">DataFrame.to_excel</span></code> and <code class="docutils literal notranslate"><span class="pre">Series.to_excel</span></code> has been removed from pandas API on Spark.</p></li>
<li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">verbose</span></code> parameter from <code class="docutils literal notranslate"><span class="pre">DataFrame.to_excel</span></code> and <code class="docutils literal notranslate"><span class="pre">Series.to_excel</span></code> has been removed from pandas API on Spark.</p></li>
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">mangle_dupe_cols</span></code> parameter from <code class="docutils literal notranslate"><span class="pre">read_csv</span></code> has been removed from pandas API on Spark.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">DataFrameGroupBy.backfill</span></code> has been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">DataFrameGroupBy.bfill</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">DataFrameGroupBy.pad</span></code> has been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">DataFrameGroupBy.ffill</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">Index.is_all_dates</span></code> has been removed from pandas API on Spark.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">convert_float</span></code> parameter from <code class="docutils literal notranslate"><span class="pre">read_excel</span></code> has been removed from pandas API on Spark.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">mangle_dupe_cols</span></code> parameter from <code class="docutils literal notranslate"><span class="pre">read_excel</span></code> has been removed from pandas API on Spark.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">DataFrame.koalas</span></code> has been removed from pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">DataFrame.pandas_on_spark</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">DataFrame.to_koalas</span></code> has been removed from PySpark, use <code class="docutils literal notranslate"><span class="pre">DataFrame.pandas_api</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">DataFrame.to_pandas_on_spark</span></code> has been removed from PySpark, use <code class="docutils literal notranslate"><span class="pre">DataFrame.pandas_api</span></code> instead.</p></li> |
<li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">DatetimeIndex.week</span></code> and <code class="docutils literal notranslate"><span class="pre">DatetimeIndex.weekofyear</span></code> have been removed from Pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">DatetimeIndex.isocalendar().week</span></code> instead.</p></li>
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">Series.dt.week</span></code> and <code class="docutils literal notranslate"><span class="pre">Series.dt.weekofyear</span></code> have been removed from Pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">Series.dt.isocalendar().week</span></code> instead.</p></li> |
<li><p>In Spark 4.0, when applying <code class="docutils literal notranslate"><span class="pre">astype</span></code> to a decimal type object, existing missing values are changed to <code class="docutils literal notranslate"><span class="pre">True</span></code> instead of <code class="docutils literal notranslate"><span class="pre">False</span></code> in Pandas API on Spark.</p></li>
| <li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">pyspark.testing.assertPandasOnSparkEqual</span></code> has been removed from Pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">pyspark.pandas.testing.assert_frame_equal</span></code> instead.</p></li> |
| <li><p>In Spark 4.0, the aliases <code class="docutils literal notranslate"><span class="pre">Y</span></code>, <code class="docutils literal notranslate"><span class="pre">M</span></code>, <code class="docutils literal notranslate"><span class="pre">H</span></code>, <code class="docutils literal notranslate"><span class="pre">T</span></code>, <code class="docutils literal notranslate"><span class="pre">S</span></code> have been deprecated from Pandas API on Spark, use <code class="docutils literal notranslate"><span class="pre">YE</span></code>, <code class="docutils literal notranslate"><span class="pre">ME</span></code>, <code class="docutils literal notranslate"><span class="pre">h</span></code>, <code class="docutils literal notranslate"><span class="pre">min</span></code>, <code class="docutils literal notranslate"><span class="pre">s</span></code> instead respectively.</p></li> |
| <li><p>In Spark 4.0, the schema of a map column is inferred by merging the schemas of all pairs in the map. To restore the previous behavior where the schema is only inferred from the first non-null pair, you can set <code class="docutils literal notranslate"><span class="pre">spark.sql.pyspark.legacy.inferMapTypeFromFirstPair.enabled</span></code> to <code class="docutils literal notranslate"><span class="pre">true</span></code>.</p></li> |
<li><p>In Spark 4.0, <code class="docutils literal notranslate"><span class="pre">compute.ops_on_diff_frames</span></code> is on by default. To restore the previous behavior, set <code class="docutils literal notranslate"><span class="pre">compute.ops_on_diff_frames</span></code> to <code class="docutils literal notranslate"><span class="pre">false</span></code>.</p></li>
| </ul> |
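<p>As a rough sketch of two of the removals above (a working pandas-API-on-Spark environment is assumed, and the frame, series, and variable names are illustrative only):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>import pyspark.pandas as ps

psdf = ps.DataFrame({"a": [1, 2]})
other = ps.DataFrame({"a": [3, 4]})

# Spark 3.5 and earlier: psdf.append(other)
# Spark 4.0: DataFrame.append was removed; use ps.concat instead.
combined = ps.concat([psdf, other], ignore_index=True)

pser = ps.Series([1, 2, 3, 4])
# Spark 3.5 and earlier: pser.between(2, 3, inclusive=True)
# Spark 4.0: pass 'both' or 'neither' instead of True or False.
mask = pser.between(2, 3, inclusive="both")
</pre></div></div>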
| </section> |
| <section id="upgrading-from-pyspark-3-3-to-3-4"> |
| <h2>Upgrading from PySpark 3.3 to 3.4<a class="headerlink" href="#upgrading-from-pyspark-3-3-to-3-4" title="Permalink to this headline">#</a></h2> |
| <ul class="simple"> |
| <li><p>In Spark 3.4, the schema of an array column is inferred by merging the schemas of all elements in the array. To restore the previous behavior where the schema is only inferred from the first element, you can set <code class="docutils literal notranslate"><span class="pre">spark.sql.pyspark.legacy.inferArrayTypeFromFirstElement.enabled</span></code> to <code class="docutils literal notranslate"><span class="pre">true</span></code>.</p></li> |
<li><p>In Spark 3.4, if the Pandas on Spark API <code class="docutils literal notranslate"><span class="pre">Groupby.apply</span></code>’s <code class="docutils literal notranslate"><span class="pre">func</span></code> parameter return type is not specified and <code class="docutils literal notranslate"><span class="pre">compute.shortcut_limit</span></code> is set to 0, the number of sampling rows is set to 2 (ensuring it is always at least 2) so that the inferred schema is accurate.</p></li>
<li><p>In Spark 3.4, the Pandas on Spark API <code class="docutils literal notranslate"><span class="pre">Index.insert</span></code> raises an <code class="docutils literal notranslate"><span class="pre">IndexError</span></code> with <code class="docutils literal notranslate"><span class="pre">index {} is out of bounds for axis 0 with size {}</span></code> when the index is out of bounds, to follow pandas 1.4 behavior.</p></li>
| <li><p>In Spark 3.4, the series name will be preserved in Pandas on Spark API <code class="docutils literal notranslate"><span class="pre">Series.mode</span></code> to follow pandas 1.4 behavior.</p></li> |
<li><p>In Spark 3.4, the Pandas on Spark API <code class="docutils literal notranslate"><span class="pre">Index.__setitem__</span></code> first checks whether <code class="docutils literal notranslate"><span class="pre">value</span></code> is of <code class="docutils literal notranslate"><span class="pre">Column</span></code> type, to avoid raising an unexpected <code class="docutils literal notranslate"><span class="pre">ValueError</span></code> in <code class="docutils literal notranslate"><span class="pre">is_list_like</span></code>, such as <cite>Cannot convert column into bool: please use ‘&amp;’ for ‘and’, ‘|’ for ‘or’, ‘~’ for ‘not’ when building DataFrame boolean expressions.</cite></p></li>
| <li><p>In Spark 3.4, the Pandas on Spark API <code class="docutils literal notranslate"><span class="pre">astype('category')</span></code> will also refresh <code class="docutils literal notranslate"><span class="pre">categories.dtype</span></code> according to original data <code class="docutils literal notranslate"><span class="pre">dtype</span></code> to follow pandas 1.4 behavior.</p></li> |
<li><p>In Spark 3.4, the Pandas on Spark API supports groupby positional indexing in <code class="docutils literal notranslate"><span class="pre">GroupBy.head</span></code> and <code class="docutils literal notranslate"><span class="pre">GroupBy.tail</span></code> to follow pandas 1.4. Negative arguments now work correctly and result in ranges relative to the end and start of each group; previously, negative arguments returned empty frames.</p></li>
<li><p>In Spark 3.4, the schema inference of <code class="docutils literal notranslate"><span class="pre">groupby.apply</span></code> in Pandas on Spark first infers the pandas type to ensure the accuracy of the pandas <code class="docutils literal notranslate"><span class="pre">dtype</span></code> as much as possible.</p></li>
<li><p>In Spark 3.4, the <code class="docutils literal notranslate"><span class="pre">Series.concat</span></code> sort parameter is respected, to follow pandas 1.4 behavior.</p></li>
<li><p>In Spark 3.4, <code class="docutils literal notranslate"><span class="pre">DataFrame.__setitem__</span></code> makes a copy and replaces pre-existing arrays, which are NOT overwritten, to follow pandas 1.4 behavior.</p></li>
<li><p>In Spark 3.4, <code class="docutils literal notranslate"><span class="pre">SparkSession.sql</span></code> and the Pandas on Spark API <code class="docutils literal notranslate"><span class="pre">sql</span></code> have a new parameter <code class="docutils literal notranslate"><span class="pre">args</span></code>, which provides binding of named parameters to their SQL literals (see the sketch after this list).</p></li>
<li><p>In Spark 3.4, Pandas API on Spark follows pandas 2.0, and some APIs were deprecated or removed in Spark 3.4 according to the changes made in pandas 2.0. Please refer to the <a class="reference external" href="https://pandas.pydata.org/docs/dev/whatsnew/">release notes of pandas</a> for more details.</p></li>
<li><p>In Spark 3.4, the custom monkey-patch of <code class="docutils literal notranslate"><span class="pre">collections.namedtuple</span></code> was removed, and <code class="docutils literal notranslate"><span class="pre">cloudpickle</span></code> is used by default. To restore the previous behavior for any relevant pickling issue of <code class="docutils literal notranslate"><span class="pre">collections.namedtuple</span></code>, set the <code class="docutils literal notranslate"><span class="pre">PYSPARK_ENABLE_NAMEDTUPLE_PATCH</span></code> environment variable to <code class="docutils literal notranslate"><span class="pre">1</span></code>.</p></li>
| </ul> |
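<p>A minimal sketch of the new <code class="docutils literal notranslate"><span class="pre">args</span></code> parameter mentioned above, assuming a running Spark 3.4+ session; the parameter name <code class="docutils literal notranslate"><span class="pre">bound</span></code> is illustrative:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Spark 3.4+: named parameters are bound to SQL literals via `args`,
# instead of interpolating values into the query string by hand.
df = spark.sql("SELECT * FROM range(10) WHERE id > :bound", args={"bound": 7})
df.show()
</pre></div></div>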
| </section> |
| <section id="upgrading-from-pyspark-3-2-to-3-3"> |
| <h2>Upgrading from PySpark 3.2 to 3.3<a class="headerlink" href="#upgrading-from-pyspark-3-2-to-3-3" title="Permalink to this headline">#</a></h2> |
| <ul class="simple"> |
<li><p>In Spark 3.3, the <code class="docutils literal notranslate"><span class="pre">pyspark.pandas.sql</span></code> method follows <a class="reference external" href="https://docs.python.org/3/library/string.html#format-string-syntax">the standard Python string formatter</a>, as sketched after this list. To restore the previous behavior, set the <code class="docutils literal notranslate"><span class="pre">PYSPARK_PANDAS_SQL_LEGACY</span></code> environment variable to <code class="docutils literal notranslate"><span class="pre">1</span></code>.</p></li>
<li><p>In Spark 3.3, the <code class="docutils literal notranslate"><span class="pre">drop</span></code> method of pandas API on Spark DataFrame supports dropping rows by <code class="docutils literal notranslate"><span class="pre">index</span></code>, and drops by index instead of by column by default.</p></li>
<li><p>In Spark 3.3, PySpark upgrades the Pandas version; the new minimum required version changes from 0.23.2 to 1.0.5.</p></li>
| <li><p>In Spark 3.3, the <code class="docutils literal notranslate"><span class="pre">repr</span></code> return values of SQL DataTypes have been changed to yield an object with the same value when passed to <code class="docutils literal notranslate"><span class="pre">eval</span></code>.</p></li> |
| </ul> |
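<p>A short sketch of the standard-formatter behavior of <code class="docutils literal notranslate"><span class="pre">pyspark.pandas.sql</span></code> described above; the variable names are illustrative:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>import pyspark.pandas as ps

psdf = ps.DataFrame({"id": [1, 2, 3]})

# Spark 3.3+: ps.sql follows the standard Python string formatter,
# so referenced variables are passed explicitly as keyword arguments.
result = ps.sql("SELECT * FROM {tbl} WHERE id > {bound}", tbl=psdf, bound=1)
</pre></div></div>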
| </section> |
| <section id="upgrading-from-pyspark-3-1-to-3-2"> |
| <h2>Upgrading from PySpark 3.1 to 3.2<a class="headerlink" href="#upgrading-from-pyspark-3-1-to-3-2" title="Permalink to this headline">#</a></h2> |
| <ul class="simple"> |
<li><p>In Spark 3.2, the PySpark methods from the sql, ml and spark_on_pandas modules raise <code class="docutils literal notranslate"><span class="pre">TypeError</span></code> instead of <code class="docutils literal notranslate"><span class="pre">ValueError</span></code> when applied to a parameter of an inappropriate type.</p></li>
<li><p>In Spark 3.2, the tracebacks from Python UDFs, pandas UDFs and pandas function APIs are simplified by default, omitting the traceback from the internal Python workers. In Spark 3.1 or earlier, the traceback from Python workers was printed out. To restore the behavior before Spark 3.2, you can set <code class="docutils literal notranslate"><span class="pre">spark.sql.execution.pyspark.udf.simplifiedTraceback.enabled</span></code> to <code class="docutils literal notranslate"><span class="pre">false</span></code>.</p></li>
<li><p>In Spark 3.2, pinned thread mode is enabled by default, mapping each Python thread to a corresponding JVM thread. Previously, one JVM thread could be reused for multiple Python threads, which resulted in one JVM thread local being shared across multiple Python threads. Also, note that using <code class="docutils literal notranslate"><span class="pre">pyspark.InheritableThread</span></code> or <code class="docutils literal notranslate"><span class="pre">pyspark.inheritable_thread_target</span></code> is now recommended so that a Python thread properly inherits the inheritable attributes, such as local properties, from a JVM thread, and to avoid a potential resource leak (see the sketch after this list). To restore the behavior before Spark 3.2, you can set the <code class="docutils literal notranslate"><span class="pre">PYSPARK_PIN_THREAD</span></code> environment variable to <code class="docutils literal notranslate"><span class="pre">false</span></code>.</p></li>
| </ul> |
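<p>A minimal sketch of the recommended <code class="docutils literal notranslate"><span class="pre">pyspark.InheritableThread</span></code> usage under pinned thread mode; the <code class="docutils literal notranslate"><span class="pre">job</span></code> function is a placeholder:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>from pyspark import InheritableThread

def job():
    # Work that should see the parent's inheritable attributes,
    # such as local properties set on the JVM thread.
    print("running in a pinned thread")

# Spark 3.2+: each Python thread maps to its own JVM thread;
# InheritableThread propagates inheritable attributes and helps
# avoid the potential resource leak mentioned above.
thread = InheritableThread(target=job)
thread.start()
thread.join()
</pre></div></div>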
| </section> |
| <section id="upgrading-from-pyspark-2-4-to-3-0"> |
| <h2>Upgrading from PySpark 2.4 to 3.0<a class="headerlink" href="#upgrading-from-pyspark-2-4-to-3-0" title="Permalink to this headline">#</a></h2> |
| <ul> |
| <li><p>In Spark 3.0, PySpark requires a pandas version of 0.23.2 or higher to use pandas related functionality, such as <code class="docutils literal notranslate"><span class="pre">toPandas</span></code>, <code class="docutils literal notranslate"><span class="pre">createDataFrame</span></code> from pandas DataFrame, and so on.</p></li> |
| <li><p>In Spark 3.0, PySpark requires a PyArrow version of 0.12.1 or higher to use PyArrow related functionality, such as <code class="docutils literal notranslate"><span class="pre">pandas_udf</span></code>, <code class="docutils literal notranslate"><span class="pre">toPandas</span></code> and <code class="docutils literal notranslate"><span class="pre">createDataFrame</span></code> with “spark.sql.execution.arrow.enabled=true”, etc.</p></li> |
<li><p>In PySpark, when creating a <code class="docutils literal notranslate"><span class="pre">SparkSession</span></code> with <code class="docutils literal notranslate"><span class="pre">SparkSession.builder.getOrCreate()</span></code>, if there is an existing <code class="docutils literal notranslate"><span class="pre">SparkContext</span></code>, the builder used to update the <code class="docutils literal notranslate"><span class="pre">SparkConf</span></code> of the existing <code class="docutils literal notranslate"><span class="pre">SparkContext</span></code> with configurations specified to the builder; but the <code class="docutils literal notranslate"><span class="pre">SparkContext</span></code> is shared by all <code class="docutils literal notranslate"><span class="pre">SparkSession</span></code>s, so it should not be updated. In Spark 3.0, the builder no longer updates the configurations. This is the same behavior as the Java/Scala API in 2.3 and above. If you want to update them, you need to update them prior to creating a <code class="docutils literal notranslate"><span class="pre">SparkSession</span></code>.</p></li>
<li><p>In PySpark, when Arrow optimization is enabled and the PyArrow version is higher than 0.11.0, Arrow can perform safe type conversion when converting a pandas.Series to an Arrow array during serialization, and raises errors when it detects unsafe type conversions such as overflow. You can enable this by setting <code class="docutils literal notranslate"><span class="pre">spark.sql.execution.pandas.convertToArrowArraySafely</span></code> to true; the default setting is false. PySpark behavior for each PyArrow version is illustrated in the following table:</p>
| <blockquote> |
| <div><table class="table"> |
| <colgroup> |
| <col style="width: 49%" /> |
| <col style="width: 20%" /> |
| <col style="width: 31%" /> |
| </colgroup> |
| <thead> |
| <tr class="row-odd"><th class="head"><p>PyArrow version</p></th> |
| <th class="head"><p>Integer overflow</p></th> |
| <th class="head"><p>Floating point truncation</p></th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr class="row-even"><td><p>0.11.0 and below</p></td> |
| <td><p>Raise error</p></td> |
| <td><p>Silently allows</p></td> |
| </tr> |
| <tr class="row-odd"><td><p>> 0.11.0, arrowSafeTypeConversion=false</p></td> |
| <td><p>Silent overflow</p></td> |
| <td><p>Silently allows</p></td> |
| </tr> |
| <tr class="row-even"><td><p>> 0.11.0, arrowSafeTypeConversion=true</p></td> |
| <td><p>Raise error</p></td> |
| <td><p>Raise error</p></td> |
| </tr> |
| </tbody> |
| </table> |
| </div></blockquote> |
| </li> |
<li><p>In Spark 3.0, <code class="docutils literal notranslate"><span class="pre">createDataFrame(...,</span> <span class="pre">verifySchema=True)</span></code> validates <code class="docutils literal notranslate"><span class="pre">LongType</span></code> as well in PySpark. Previously, <code class="docutils literal notranslate"><span class="pre">LongType</span></code> was not verified and resulted in <code class="docutils literal notranslate"><span class="pre">None</span></code> when the value overflowed. To restore this behavior, <code class="docutils literal notranslate"><span class="pre">verifySchema</span></code> can be set to <code class="docutils literal notranslate"><span class="pre">False</span></code> to disable the validation.</p></li>
<li><p>As of Spark 3.0, <code class="docutils literal notranslate"><span class="pre">Row</span></code> field names are no longer sorted alphabetically when constructed with named arguments for Python versions 3.6 and above, and the order of fields matches that as entered (see the sketch after this list). To enable sorted fields by default, as in Spark 2.4, set the environment variable <code class="docutils literal notranslate"><span class="pre">PYSPARK_ROW_FIELD_SORTING_ENABLED</span></code> to true for both executors and driver; this environment variable must be consistent on all executors and the driver, otherwise it may cause failures or incorrect answers. For Python versions lower than 3.6, the field names are always sorted alphabetically.</p></li>
| <li><p>In Spark 3.0, <code class="docutils literal notranslate"><span class="pre">pyspark.ml.param.shared.Has*</span></code> mixins do not provide any <code class="docutils literal notranslate"><span class="pre">set*(self,</span> <span class="pre">value)</span></code> setter methods anymore, use the respective <code class="docutils literal notranslate"><span class="pre">self.set(self.*,</span> <span class="pre">value)</span></code> instead. See <a class="reference external" href="https://issues.apache.org/jira/browse/SPARK-29093">SPARK-29093</a> for details.</p></li> |
| </ul> |
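<p>A short sketch of the <code class="docutils literal notranslate"><span class="pre">Row</span></code> field-ordering change described above:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>from pyspark.sql import Row

# Spark 3.0+ (Python 3.6+): fields keep the order they were entered.
row = Row(b=1, a=2)   # Row(b=1, a=2)

# Spark 2.4 sorted field names alphabetically: Row(a=2, b=1).
# Set PYSPARK_ROW_FIELD_SORTING_ENABLED=true on the driver and all
# executors to restore the old sorting.
</pre></div></div>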
| </section> |
| <section id="upgrading-from-pyspark-2-3-to-2-4"> |
| <h2>Upgrading from PySpark 2.3 to 2.4<a class="headerlink" href="#upgrading-from-pyspark-2-3-to-2-4" title="Permalink to this headline">#</a></h2> |
| <ul class="simple"> |
<li><p>In PySpark, when Arrow optimization is enabled, <code class="docutils literal notranslate"><span class="pre">toPandas</span></code> previously just failed when Arrow optimization was unable to be used, whereas <code class="docutils literal notranslate"><span class="pre">createDataFrame</span></code> from a Pandas DataFrame allowed a fallback to the non-optimized path. Now, both <code class="docutils literal notranslate"><span class="pre">toPandas</span></code> and <code class="docutils literal notranslate"><span class="pre">createDataFrame</span></code> from a Pandas DataFrame allow the fallback by default, which can be switched off with <code class="docutils literal notranslate"><span class="pre">spark.sql.execution.arrow.fallback.enabled</span></code> (see the sketch after this list).</p></li>
| </ul> |
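<p>For example, the fallback can be switched off so that both paths fail fast when Arrow cannot be used (a sketch assuming an existing session):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Disable the non-Arrow fallback introduced in Spark 2.4; toPandas and
# createDataFrame from a Pandas DataFrame will then raise an error
# when Arrow optimization cannot be used.
spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "false")
</pre></div></div>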
| </section> |
| <section id="upgrading-from-pyspark-2-3-0-to-2-3-1-and-above"> |
| <h2>Upgrading from PySpark 2.3.0 to 2.3.1 and above<a class="headerlink" href="#upgrading-from-pyspark-2-3-0-to-2-3-1-and-above" title="Permalink to this headline">#</a></h2> |
| <ul class="simple"> |
<li><p>As of version 2.3.1, Arrow functionality, including <code class="docutils literal notranslate"><span class="pre">pandas_udf</span></code> and <code class="docutils literal notranslate"><span class="pre">toPandas()</span></code>/<code class="docutils literal notranslate"><span class="pre">createDataFrame()</span></code> with <code class="docutils literal notranslate"><span class="pre">spark.sql.execution.arrow.enabled</span></code> set to <code class="docutils literal notranslate"><span class="pre">True</span></code>, has been marked as experimental. These are still evolving and not currently recommended for use in production.</p></li>
| </ul> |
| </section> |
| <section id="upgrading-from-pyspark-2-2-to-2-3"> |
| <h2>Upgrading from PySpark 2.2 to 2.3<a class="headerlink" href="#upgrading-from-pyspark-2-2-to-2-3" title="Permalink to this headline">#</a></h2> |
| <ul class="simple"> |
<li><p>In PySpark, Pandas 0.19.2 or higher is now required to use Pandas related functionality, such as <code class="docutils literal notranslate"><span class="pre">toPandas</span></code>, <code class="docutils literal notranslate"><span class="pre">createDataFrame</span></code> from a Pandas DataFrame, etc.</p></li>
| <li><p>In PySpark, the behavior of timestamp values for Pandas related functionalities was changed to respect session timezone. If you want to use the old behavior, you need to set a configuration <code class="docutils literal notranslate"><span class="pre">spark.sql.execution.pandas.respectSessionTimeZone</span></code> to False. See <a class="reference external" href="https://issues.apache.org/jira/browse/SPARK-22395">SPARK-22395</a> for details.</p></li> |
<li><p>In PySpark, <code class="docutils literal notranslate"><span class="pre">na.fill()</span></code> or <code class="docutils literal notranslate"><span class="pre">fillna</span></code> also accepts boolean and replaces nulls with booleans, as sketched after this list. In prior Spark versions, PySpark just ignored it and returned the original Dataset/DataFrame.</p></li>
<li><p>In PySpark, <code class="docutils literal notranslate"><span class="pre">df.replace</span></code> does not allow omitting <code class="docutils literal notranslate"><span class="pre">value</span></code> when <code class="docutils literal notranslate"><span class="pre">to_replace</span></code> is not a dictionary. Previously, <code class="docutils literal notranslate"><span class="pre">value</span></code> could be omitted in the other cases and defaulted to <code class="docutils literal notranslate"><span class="pre">None</span></code>, which was counterintuitive and error-prone.</p></li>
| </ul> |
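<p>A minimal sketch of the boolean <code class="docutils literal notranslate"><span class="pre">fillna</span></code> behavior above; the column name is illustrative:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(True,), (None,)], "flag boolean")

# Spark 2.3+: a boolean fill value replaces nulls in boolean columns.
# Spark 2.2 and earlier ignored it and returned the DataFrame unchanged.
df.na.fill(True).show()
</pre></div></div>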
| </section> |
| <section id="upgrading-from-pyspark-1-4-to-1-5"> |
| <h2>Upgrading from PySpark 1.4 to 1.5<a class="headerlink" href="#upgrading-from-pyspark-1-4-to-1-5" title="Permalink to this headline">#</a></h2> |
| <ul class="simple"> |
<li><p>Resolution of strings to columns in Python now supports using dots (.) to qualify the column or access nested values, for example <code class="docutils literal notranslate"><span class="pre">df['table.column.nestedField']</span></code>. However, this means that if your column name contains any dots you must now escape them using backticks (e.g., <code class="docutils literal notranslate"><span class="pre">table.`column.with.dots`.nested</span></code>), as sketched after this list.</p></li>
<li><p>The <code class="docutils literal notranslate"><span class="pre">DataFrame.withColumn</span></code> method in PySpark supports adding a new column or replacing an existing column of the same name.</p></li>
| </ul> |
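<p>A short sketch of escaping a literal dot in a column name, assuming a running session; the column names are illustrative:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 2)], ["a", "column.with.dots"])

# Since 1.5, an unescaped dot qualifies nested fields, so a literal
# dot in a column name must be escaped with backticks.
df.select("`column.with.dots`").show()
</pre></div></div>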
| </section> |
| <section id="upgrading-from-pyspark-1-0-1-2-to-1-3"> |
| <h2>Upgrading from PySpark 1.0-1.2 to 1.3<a class="headerlink" href="#upgrading-from-pyspark-1-0-1-2-to-1-3" title="Permalink to this headline">#</a></h2> |
| <ul class="simple"> |
<li><p>When using DataTypes in Python you will need to construct them (e.g., <code class="docutils literal notranslate"><span class="pre">StringType()</span></code>) instead of referencing a singleton, as sketched after this list.</p></li>
| </ul> |
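<p>A brief sketch of constructing DataTypes:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre>from pyspark.sql.types import StringType, StructField, StructType

# Spark 1.3+: construct DataTypes with parentheses, e.g. StringType(),
# rather than referencing a singleton.
schema = StructType([StructField("name", StringType(), True)])
</pre></div></div>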
| </section> |
| </section> |
| |
| |
| </article> |
| |
| |
| |
| <footer class="bd-footer-article"> |
| |
| <div class="footer-article-items footer-article__inner"> |
| |
| <div class="footer-article-item"><!-- Previous / next buttons --> |
| <div class="prev-next-area"> |
| <a class="left-prev" |
| href="index.html" |
| title="previous page"> |
| <i class="fa-solid fa-angle-left"></i> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">previous</p> |
| <p class="prev-next-title">Migration Guides</p> |
| </div> |
| </a> |
| <a class="right-next" |
| href="koalas_to_pyspark.html" |
| title="next page"> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">next</p> |
| <p class="prev-next-title">Migrating from Koalas to pandas API on Spark</p> |
| </div> |
| <i class="fa-solid fa-angle-right"></i> |
| </a> |
| </div></div> |
| |
| </div> |
| |
| </footer> |
| |
| </div> |
| |
| |
| |
| <div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner"> |
| |
| <div class="sidebar-secondary-item"> |
| <div class="page-toc tocsection onthispage"> |
| <i class="fa-solid fa-list"></i> On this page |
| </div> |
| <nav class="bd-toc-nav page-toc"> |
| <ul class="visible nav section-nav flex-column"> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#upgrading-from-pyspark-3-5-to-4-0">Upgrading from PySpark 3.5 to 4.0</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#upgrading-from-pyspark-3-3-to-3-4">Upgrading from PySpark 3.3 to 3.4</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#upgrading-from-pyspark-3-2-to-3-3">Upgrading from PySpark 3.2 to 3.3</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#upgrading-from-pyspark-3-1-to-3-2">Upgrading from PySpark 3.1 to 3.2</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#upgrading-from-pyspark-2-4-to-3-0">Upgrading from PySpark 2.4 to 3.0</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#upgrading-from-pyspark-2-3-to-2-4">Upgrading from PySpark 2.3 to 2.4</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#upgrading-from-pyspark-2-3-0-to-2-3-1-and-above">Upgrading from PySpark 2.3.0 to 2.3.1 and above</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#upgrading-from-pyspark-2-2-to-2-3">Upgrading from PySpark 2.2 to 2.3</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#upgrading-from-pyspark-1-4-to-1-5">Upgrading from PySpark 1.4 to 1.5</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#upgrading-from-pyspark-1-0-1-2-to-1-3">Upgrading from PySpark 1.0-1.2 to 1.3</a></li> |
| </ul> |
| </nav></div> |
| |
| <div class="sidebar-secondary-item"> |
| <div class="tocsection sourcelink"> |
| <a href="../_sources/migration_guide/pyspark_upgrade.rst.txt"> |
| <i class="fa-solid fa-file-lines"></i> Show Source |
| </a> |
| </div> |
| </div> |
| |
| </div></div> |
| |
| |
| </div> |
| <footer class="bd-footer-content"> |
| |
| </footer> |
| |
| </main> |
| </div> |
| </div> |
| |
| |
| <footer class="bd-footer"> |
| <div class="bd-footer__inner bd-page-width"> |
| |
| <div class="footer-items__start"> |
| |
| <div class="footer-item"><p class="copyright"> |
| Copyright @ 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. |
| </p></div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0. |
| <br/> |
| </p> |
| </div> |
| |
| </div> |
| |
| |
| <div class="footer-items__end"> |
| |
| <div class="footer-item"><p class="theme-version"> |
| Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3. |
| </p></div> |
| |
| </div> |
| |
| </div> |
| |
| </footer> |
| </body> |
| </html> |