| |
| |
| <!DOCTYPE html> |
| |
| |
<html lang="en">
| |
| <head> |
| <meta charset="utf-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" /> |
| |
| <title>Python Package Management — PySpark 4.0.0-preview2 documentation</title> |
| |
| |
| |
| <script data-cfasync="false"> |
      // Restore the user's persisted color mode/theme from localStorage before
      // first paint to avoid a flash of the wrong theme; default to "light".
      document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
      document.documentElement.dataset.theme = localStorage.getItem("theme") || "light";
| </script> |
| |
| <!-- Loaded before other Sphinx assets --> |
| <link href="../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| |
| |
| <link href="../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" /> |
| |
| <link rel="stylesheet" type="text/css" href="../_static/pygments.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/copybutton.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/css/pyspark.css" /> |
| |
| <!-- Pre-loaded scripts that we'll load fully later --> |
| <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=e353d410970836974a52" /> |
| <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" /> |
| |
| <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script> |
| <script src="../_static/jquery.js"></script> |
| <script src="../_static/underscore.js"></script> |
| <script src="../_static/doctools.js"></script> |
| <script src="../_static/clipboard.min.js"></script> |
| <script src="../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script>DOCUMENTATION_OPTIONS.pagename = 'user_guide/python_packaging';</script> |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/user_guide/python_packaging.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="Spark SQL" href="sql/index.html" /> |
| <link rel="prev" title="User Guides" href="index.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="None"> |
| |
| |
| <!-- Matomo --> |
| <script type="text/javascript"> |
    // Matomo analytics bootstrap: _paq is the command queue; commands pushed
    // before matomo.js loads are replayed once the tracker script arrives.
    var _paq = window._paq = window._paq || [];
    /* tracker methods like "setCustomDimension" should be called before "trackPageView" */
    // Privacy: do not set tracking cookies.
    _paq.push(["disableCookies"]);
    _paq.push(['trackPageView']);
    _paq.push(['enableLinkTracking']);
    (function() {
      var u="https://analytics.apache.org/";
      _paq.push(['setTrackerUrl', u+'matomo.php']);
      _paq.push(['setSiteId', '40']);
      // Inject the tracker script asynchronously before the first <script> tag.
      var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
      g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
    })();
| </script> |
| <!-- End Matomo Code --> |
| |
| </head> |
| |
| |
| <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode=""> |
| |
| |
| |
| <a class="skip-link" href="#main-content">Skip to main content</a> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__primary" |
| id="__primary"/> |
| <label class="overlay overlay-primary" for="__primary"></label> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__secondary" |
| id="__secondary"/> |
| <label class="overlay overlay-secondary" for="__secondary"></label> |
| |
| <div class="search-button__wrapper"> |
| <div class="search-button__overlay"></div> |
| <div class="search-button__search-container"> |
| <form class="bd-search d-flex align-items-center" |
| action="../search.html" |
| method="get"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| <input type="search" |
| class="form-control" |
| name="q" |
| id="search-input" |
| placeholder="Search the docs ..." |
| aria-label="Search the docs ..." |
| autocomplete="off" |
| autocorrect="off" |
| autocapitalize="off" |
| spellcheck="false"/> |
| <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span> |
| </form></div> |
| </div> |
| |
| <nav class="bd-header navbar navbar-expand-lg bd-navbar"> |
| <div class="bd-header__inner bd-page-width"> |
| <label class="sidebar-toggle primary-toggle" for="__primary"> |
| <span class="fa-solid fa-bars"></span> |
| </label> |
| |
| <div class="navbar-header-items__start"> |
| |
| <div class="navbar-item"> |
| |
| |
| <a class="navbar-brand logo" href="../index.html"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <img src="https://spark.apache.org/images/spark-logo.png" class="logo__image only-light" alt="Logo image"/> |
| <script>document.write(`<img src="https://spark.apache.org/images/spark-logo-rev.svg" class="logo__image only-dark" alt="Logo image"/>`);</script> |
| |
| |
| </a></div> |
| |
| </div> |
| |
| |
| <div class="col-lg-9 navbar-header-items"> |
| |
| <div class="me-auto navbar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item current active"> |
| <a class="nav-link nav-internal" href="index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| <div class="navbar-header-items__end"> |
| |
| <div class="navbar-item navbar-persistent--container"> |
| |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview2 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Function to construct the target URL from the JSON components |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| template = template.replace("{version}", entry.version); |
| return template; |
| } |
| |
| // Function to check if corresponding page path exists in other version of docs |
| // and, if so, go there instead of the homepage of the other docs version |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "user_guide/python_packaging.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| let tryUrl = `${otherDocsHomepage}${currentFilePath}`; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Function to populate the version switcher |
      // Immediately-invoked setup: fetch the list of published doc versions and
      // populate the #version_switcher dropdown with one link per version.
      (function () {
        // get JSON config (an array of {version[, name]} entries)
        $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) {
          // create the nodes first (before AJAX calls) to ensure the order is
          // correct (for now, links will go to doc version homepage)
          $.each(data, function(index, entry) {
            // if no custom name specified (e.g., "latest"), use version string
            if (!("name" in entry)) {
              entry.name = entry.version;
            }
            // construct the appropriate URL, and add it to the dropdown
            entry.url = buildURL(entry);
            const node = document.createElement("a");
            node.setAttribute("class", "list-group-item list-group-item-action py-1");
            node.setAttribute("href", `${entry.url}`);
            node.textContent = `${entry.name}`;
            // clicking first probes for this same page in the target version
            node.onclick = checkPageExistsAndRedirect;
            $("#version_switcher").append(node);
          });
        });
      })();
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <label class="sr-only">GitHub</label></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <label class="sr-only">PyPI</label></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="navbar-persistent--mobile"> |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| |
| <label class="sidebar-toggle secondary-toggle" for="__secondary"> |
| <span class="fa-solid fa-outdent"></span> |
| </label> |
| |
| </div> |
| |
| </nav> |
| |
| <div class="bd-container"> |
| <div class="bd-container__inner bd-page-width"> |
| |
| <div class="bd-sidebar-primary bd-sidebar"> |
| |
| |
| |
| <div class="sidebar-header-items sidebar-primary__section"> |
| |
| |
| <div class="sidebar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../getting_started/index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item current active"> |
| <a class="nav-link nav-internal" href="index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| |
| <div class="sidebar-header-items__end"> |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview2 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Function to construct the target URL from the JSON components |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| template = template.replace("{version}", entry.version); |
| return template; |
| } |
| |
| // Function to check if corresponding page path exists in other version of docs |
| // and, if so, go there instead of the homepage of the other docs version |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "user_guide/python_packaging.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| let tryUrl = `${otherDocsHomepage}${currentFilePath}`; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Function to populate the version switcher |
      // Immediately-invoked setup: fetch the list of published doc versions and
      // populate the #version_switcher dropdown with one link per version.
      // NOTE(review): this markup/script pair appears twice in the page (header
      // and sidebar) with identical element ids; jQuery's "#version_switcher"
      // selector only matches the FIRST occurrence — TODO confirm the second
      // dropdown actually gets populated and dedupe the ids.
      (function () {
        // get JSON config (an array of {version[, name]} entries)
        $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) {
          // create the nodes first (before AJAX calls) to ensure the order is
          // correct (for now, links will go to doc version homepage)
          $.each(data, function(index, entry) {
            // if no custom name specified (e.g., "latest"), use version string
            if (!("name" in entry)) {
              entry.name = entry.version;
            }
            // construct the appropriate URL, and add it to the dropdown
            entry.url = buildURL(entry);
            const node = document.createElement("a");
            node.setAttribute("class", "list-group-item list-group-item-action py-1");
            node.setAttribute("href", `${entry.url}`);
            node.textContent = `${entry.name}`;
            // clicking first probes for this same page in the target version
            node.onclick = checkPageExistsAndRedirect;
            $("#version_switcher").append(node);
          });
        });
      })();
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <label class="sr-only">GitHub</label></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <label class="sr-only">PyPI</label></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| <div class="sidebar-primary-items__start sidebar-primary__section"> |
| <div class="sidebar-primary-item"><nav class="bd-docs-nav bd-links" |
| aria-label="Section Navigation"> |
| <p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p> |
| <div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav"> |
| <li class="toctree-l1 current active"><a class="current reference internal" href="#">Python Package Management</a></li> |
| <li class="toctree-l1 has-children"><a class="reference internal" href="sql/index.html">Spark SQL</a><input class="toctree-checkbox" id="toctree-checkbox-1" name="toctree-checkbox-1" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-1"><i class="fa-solid fa-chevron-down"></i></label><ul> |
| <li class="toctree-l2"><a class="reference internal" href="sql/arrow_pandas.html">Apache Arrow in PySpark</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="sql/python_udtf.html">Python User-defined Table Functions (UDTFs)</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="sql/python_data_source.html">Python Data Source API</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="sql/type_conversions.html">Python to Spark Type Conversions</a></li> |
| </ul> |
| </li> |
| <li class="toctree-l1 has-children"><a class="reference internal" href="pandas_on_spark/index.html">Pandas API on Spark</a><input class="toctree-checkbox" id="toctree-checkbox-2" name="toctree-checkbox-2" type="checkbox"/><label class="toctree-toggle" for="toctree-checkbox-2"><i class="fa-solid fa-chevron-down"></i></label><ul> |
| <li class="toctree-l2"><a class="reference internal" href="pandas_on_spark/options.html">Options and settings</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="pandas_on_spark/pandas_pyspark.html">From/to pandas and PySpark DataFrames</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="pandas_on_spark/transform_apply.html">Transform and apply a function</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="pandas_on_spark/types.html">Type Support in Pandas API on Spark</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="pandas_on_spark/typehints.html">Type Hints in Pandas API on Spark</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="pandas_on_spark/from_to_dbms.html">From/to other DBMSes</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="pandas_on_spark/best_practices.html">Best Practices</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="pandas_on_spark/supported_pandas_api.html">Supported pandas API</a></li> |
| <li class="toctree-l2"><a class="reference internal" href="pandas_on_spark/faq.html">FAQ</a></li> |
| </ul> |
| </li> |
| </ul> |
| </div> |
| </nav></div> |
| </div> |
| |
| |
| <div class="sidebar-primary-items__end sidebar-primary__section"> |
| </div> |
| |
| <div id="rtd-footer-container"></div> |
| |
| |
| </div> |
| |
| <main id="main-content" class="bd-main"> |
| |
| |
| <div class="bd-content"> |
| <div class="bd-article-container"> |
| |
| <div class="bd-header-article"> |
| <div class="header-article-items header-article__inner"> |
| |
| <div class="header-article-items__start"> |
| |
| <div class="header-article-item"> |
| |
| |
| |
| <nav aria-label="Breadcrumbs"> |
| <ul class="bd-breadcrumbs" role="navigation" aria-label="Breadcrumb"> |
| |
| <li class="breadcrumb-item breadcrumb-home"> |
| <a href="../index.html" class="nav-link" aria-label="Home"> |
| <i class="fa-solid fa-home"></i> |
| </a> |
| </li> |
| |
| <li class="breadcrumb-item"><a href="index.html" class="nav-link">User Guides</a></li> |
| |
| <li class="breadcrumb-item active" aria-current="page">Python Package Management</li> |
| </ul> |
| </nav> |
| </div> |
| |
| </div> |
| |
| |
| </div> |
| </div> |
| |
| |
| |
| |
| <div id="searchbox"></div> |
| <article class="bd-article" role="main"> |
| |
| <section id="python-package-management"> |
| <h1>Python Package Management<a class="headerlink" href="#python-package-management" title="Permalink to this headline">#</a></h1> |
| <p>When you want to run your PySpark application on a cluster such as YARN, Kubernetes, etc., you need to make |
| sure that your code and all used libraries are available on the executors.</p> |
| <p>As an example, let’s say you may want to run the <a class="reference internal" href="sql/arrow_pandas.html#series-to-scalar"><span class="std std-ref">Pandas UDF examples</span></a>. |
| As it uses pyarrow as an underlying implementation we need to make sure to have pyarrow installed on each executor |
| on the cluster. Otherwise you may get errors such as <code class="docutils literal notranslate"><span class="pre">ModuleNotFoundError:</span> <span class="pre">No</span> <span class="pre">module</span> <span class="pre">named</span> <span class="pre">'pyarrow'</span></code>.</p> |
| <p>Here is the script <code class="docutils literal notranslate"><span class="pre">app.py</span></code> from the previous example that will be executed on the cluster:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql.functions</span> <span class="kn">import</span> <span class="n">pandas_udf</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| |
| <span class="k">def</span> <span class="nf">main</span><span class="p">(</span><span class="n">spark</span><span class="p">):</span> |
| <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="n">createDataFrame</span><span class="p">(</span> |
| <span class="p">[(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">1.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mf">2.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">3.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">5.0</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mf">10.0</span><span class="p">)],</span> |
| <span class="p">(</span><span class="s2">"id"</span><span class="p">,</span> <span class="s2">"v"</span><span class="p">))</span> |
| |
| <span class="nd">@pandas_udf</span><span class="p">(</span><span class="s2">"double"</span><span class="p">)</span> |
| <span class="k">def</span> <span class="nf">mean_udf</span><span class="p">(</span><span class="n">v</span><span class="p">:</span> <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">)</span> <span class="o">-></span> <span class="nb">float</span><span class="p">:</span> |
| <span class="k">return</span> <span class="n">v</span><span class="o">.</span><span class="n">mean</span><span class="p">()</span> |
| |
| <span class="nb">print</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">groupby</span><span class="p">(</span><span class="s2">"id"</span><span class="p">)</span><span class="o">.</span><span class="n">agg</span><span class="p">(</span><span class="n">mean_udf</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s1">'v'</span><span class="p">]))</span><span class="o">.</span><span class="n">collect</span><span class="p">())</span> |
| |
| |
| <span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s2">"__main__"</span><span class="p">:</span> |
| <span class="n">main</span><span class="p">(</span><span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">())</span> |
| </pre></div> |
| </div> |
| <p>There are multiple ways to manage Python dependencies in the cluster:</p> |
| <ul class="simple"> |
| <li><p>Using PySpark Native Features</p></li> |
| <li><p>Using Conda</p></li> |
| <li><p>Using Virtualenv</p></li> |
| <li><p>Using PEX</p></li> |
| </ul> |
| <section id="using-pyspark-native-features"> |
| <h2>Using PySpark Native Features<a class="headerlink" href="#using-pyspark-native-features" title="Permalink to this headline">#</a></h2> |
| <p>PySpark allows to upload Python files (<code class="docutils literal notranslate"><span class="pre">.py</span></code>), zipped Python packages (<code class="docutils literal notranslate"><span class="pre">.zip</span></code>), and Egg files (<code class="docutils literal notranslate"><span class="pre">.egg</span></code>) |
| to the executors by one of the following:</p> |
| <ul class="simple"> |
| <li><p>Setting the configuration setting <code class="docutils literal notranslate"><span class="pre">spark.submit.pyFiles</span></code></p></li> |
| <li><p>Setting <code class="docutils literal notranslate"><span class="pre">--py-files</span></code> option in Spark scripts</p></li> |
| <li><p>Directly calling <a class="reference internal" href="../reference/api/pyspark.SparkContext.addPyFile.html#pyspark.SparkContext.addPyFile" title="pyspark.SparkContext.addPyFile"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyspark.SparkContext.addPyFile()</span></code></a> in applications</p></li> |
| </ul> |
| <p>This is a straightforward method to ship additional custom Python code to the cluster. You can just add individual files or zip whole |
| packages and upload them. Using <a class="reference internal" href="../reference/api/pyspark.SparkContext.addPyFile.html#pyspark.SparkContext.addPyFile" title="pyspark.SparkContext.addPyFile"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyspark.SparkContext.addPyFile()</span></code></a> allows you to upload code even after having started your job.</p> |
| <p>However, it does not allow to add packages built as <a class="reference external" href="https://www.python.org/dev/peps/pep-0427/">Wheels</a> and therefore |
| does not allow to include dependencies with native code.</p> |
| </section> |
| <section id="using-conda"> |
| <h2>Using Conda<a class="headerlink" href="#using-conda" title="Permalink to this headline">#</a></h2> |
| <p><a class="reference external" href="https://docs.conda.io/en/latest/">Conda</a> is one of the most widely-used Python package management systems. PySpark users can directly |
| use a Conda environment to ship their third-party Python packages by leveraging |
| <a class="reference external" href="https://conda.github.io/conda-pack/spark.html">conda-pack</a> which is a command line tool creating |
| relocatable Conda environments.</p> |
| <p>The example below creates a Conda environment to use on both the driver and executor and packs |
| it into an archive file. This archive file captures the Conda environment for Python and stores |
| both Python interpreter and all its relevant dependencies.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda<span class="w"> </span>create<span class="w"> </span>-y<span class="w"> </span>-n<span class="w"> </span>pyspark_conda_env<span class="w"> </span>-c<span class="w"> </span>conda-forge<span class="w"> </span>pyarrow<span class="w"> </span>pandas<span class="w"> </span>conda-pack |
| conda<span class="w"> </span>activate<span class="w"> </span>pyspark_conda_env |
| conda<span class="w"> </span>pack<span class="w"> </span>-f<span class="w"> </span>-o<span class="w"> </span>pyspark_conda_env.tar.gz |
| </pre></div> |
| </div> |
| <p>After that, you can ship it together with scripts or in the code by using the <code class="docutils literal notranslate"><span class="pre">--archives</span></code> option |
| or <code class="docutils literal notranslate"><span class="pre">spark.archives</span></code> configuration (<code class="docutils literal notranslate"><span class="pre">spark.yarn.dist.archives</span></code> in YARN). It automatically unpacks the archive on executors.</p> |
| <p>In the case of a <code class="docutils literal notranslate"><span class="pre">spark-submit</span></code> script, you can use it as follows:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_DRIVER_PYTHON</span><span class="o">=</span>python<span class="w"> </span><span class="c1"># Do not set in cluster modes.</span> |
| <span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_PYTHON</span><span class="o">=</span>./environment/bin/python |
| spark-submit<span class="w"> </span>--archives<span class="w"> </span>pyspark_conda_env.tar.gz#environment<span class="w"> </span>app.py |
| </pre></div> |
| </div> |
| <p>Note that <code class="docutils literal notranslate"><span class="pre">PYSPARK_DRIVER_PYTHON</span></code> above should not be set for cluster modes in YARN or Kubernetes.</p> |
| <p>If you’re on a regular Python shell or notebook, you can try it as shown below:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">os</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="kn">from</span> <span class="nn">app</span> <span class="kn">import</span> <span class="n">main</span> |
| |
| <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s1">'PYSPARK_PYTHON'</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"./environment/bin/python"</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">config</span><span class="p">(</span> |
| <span class="s2">"spark.archives"</span><span class="p">,</span> <span class="c1"># 'spark.yarn.dist.archives' in YARN.</span> |
| <span class="s2">"pyspark_conda_env.tar.gz#environment"</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="n">main</span><span class="p">(</span><span class="n">spark</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>For a pyspark shell:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_DRIVER_PYTHON</span><span class="o">=</span>python |
| <span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_PYTHON</span><span class="o">=</span>./environment/bin/python |
| pyspark<span class="w"> </span>--archives<span class="w"> </span>pyspark_conda_env.tar.gz#environment |
| </pre></div> |
| </div> |
| </section> |
| <section id="using-virtualenv"> |
| <h2>Using Virtualenv<a class="headerlink" href="#using-virtualenv" title="Permalink to this headline">#</a></h2> |
| <p><a class="reference external" href="https://virtualenv.pypa.io/en/latest/">Virtualenv</a> is a Python tool to create isolated Python environments. |
| Since Python 3.3, a subset of its features has been integrated into Python as a standard library under |
| the <a class="reference external" href="https://docs.python.org/3/library/venv.html">venv</a> module. PySpark users can use virtualenv to manage |
| Python dependencies in their clusters by using <a class="reference external" href="https://jcristharif.com/venv-pack/index.html">venv-pack</a> |
| in a similar way as conda-pack.</p> |
| <p>A virtual environment to use on both driver and executor can be created as demonstrated below. |
| It packs the current virtual environment to an archive file, and it contains both the Python interpreter and the dependencies. |
| However, it requires all nodes in a cluster to have the same Python interpreter installed because |
| <a class="reference external" href="https://github.com/jcrist/venv-pack/issues/5">venv-pack packs Python interpreter as a symbolic link</a>.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>-m<span class="w"> </span>venv<span class="w"> </span>pyspark_venv |
| <span class="nb">source</span><span class="w"> </span>pyspark_venv/bin/activate |
| pip<span class="w"> </span>install<span class="w"> </span>pyarrow<span class="w"> </span>pandas<span class="w"> </span>venv-pack |
| venv-pack<span class="w"> </span>-o<span class="w"> </span>pyspark_venv.tar.gz |
| </pre></div> |
| </div> |
| <p>You can directly pass/unpack the archive file and enable the environment on executors by leveraging |
| the <code class="docutils literal notranslate"><span class="pre">--archives</span></code> option or <code class="docutils literal notranslate"><span class="pre">spark.archives</span></code> configuration (<code class="docutils literal notranslate"><span class="pre">spark.yarn.dist.archives</span></code> in YARN).</p> |
| <p>For <code class="docutils literal notranslate"><span class="pre">spark-submit</span></code>, you can use it by running the command as follows. Also, notice that |
| <code class="docutils literal notranslate"><span class="pre">PYSPARK_DRIVER_PYTHON</span></code> has to be unset in Kubernetes or YARN cluster modes.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_DRIVER_PYTHON</span><span class="o">=</span>python<span class="w"> </span><span class="c1"># Do not set in cluster modes.</span> |
| <span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_PYTHON</span><span class="o">=</span>./environment/bin/python |
| spark-submit<span class="w"> </span>--archives<span class="w"> </span>pyspark_venv.tar.gz#environment<span class="w"> </span>app.py |
| </pre></div> |
| </div> |
| <p>For regular Python shells or notebooks:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">os</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="kn">from</span> <span class="nn">app</span> <span class="kn">import</span> <span class="n">main</span> |
| |
| <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s1">'PYSPARK_PYTHON'</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"./environment/bin/python"</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">config</span><span class="p">(</span> |
| <span class="s2">"spark.archives"</span><span class="p">,</span> <span class="c1"># 'spark.yarn.dist.archives' in YARN.</span> |
| <span class="s2">"pyspark_venv.tar.gz#environment"</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="n">main</span><span class="p">(</span><span class="n">spark</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>In the case of a pyspark shell:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_DRIVER_PYTHON</span><span class="o">=</span>python |
| <span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_PYTHON</span><span class="o">=</span>./environment/bin/python |
| pyspark<span class="w"> </span>--archives<span class="w"> </span>pyspark_venv.tar.gz#environment |
| </pre></div> |
| </div> |
| </section> |
| <section id="using-pex"> |
| <h2>Using PEX<a class="headerlink" href="#using-pex" title="Permalink to this headline">#</a></h2> |
| <p>PySpark can also use <a class="reference external" href="https://github.com/pantsbuild/pex">PEX</a> to ship the Python packages |
| together. PEX is a tool that creates a self-contained Python environment. This is similar |
| to Conda or virtualenv, but a <code class="docutils literal notranslate"><span class="pre">.pex</span></code> file is executable by itself.</p> |
| <p>The following example creates a <code class="docutils literal notranslate"><span class="pre">.pex</span></code> file for the driver and executor to use. |
| The file contains the Python dependencies specified with the <code class="docutils literal notranslate"><span class="pre">pex</span></code> command.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>pyarrow<span class="w"> </span>pandas<span class="w"> </span>pex |
| pex<span class="w"> </span>pyspark<span class="w"> </span>pyarrow<span class="w"> </span>pandas<span class="w"> </span>-o<span class="w"> </span>pyspark_pex_env.pex |
| </pre></div> |
| </div> |
| <p>This file behaves similarly to a regular Python interpreter.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>./pyspark_pex_env.pex<span class="w"> </span>-c<span class="w"> </span><span class="s2">"import pandas; print(pandas.__version__)"</span> |
| <span class="m">1</span>.1.5 |
| </pre></div> |
| </div> |
| <p>However, a <code class="docutils literal notranslate"><span class="pre">.pex</span></code> file does not include a Python interpreter itself under the hood, so all |
| nodes in a cluster should have the same Python interpreter installed.</p> |
| <p>In order to transfer and use the <code class="docutils literal notranslate"><span class="pre">.pex</span></code> file in a cluster, you should ship it via the |
| <code class="docutils literal notranslate"><span class="pre">spark.files</span></code> configuration (<code class="docutils literal notranslate"><span class="pre">spark.yarn.dist.files</span></code> in YARN) or <code class="docutils literal notranslate"><span class="pre">--files</span></code> option because it is a regular file instead |
| of a directory or archive file.</p> |
| <p>For application submission, you run the commands as shown below. |
| Note that <code class="docutils literal notranslate"><span class="pre">PYSPARK_DRIVER_PYTHON</span></code> should not be set for cluster modes in YARN or Kubernetes.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_DRIVER_PYTHON</span><span class="o">=</span>python<span class="w"> </span><span class="c1"># Do not set in cluster modes.</span> |
| <span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_PYTHON</span><span class="o">=</span>./pyspark_pex_env.pex |
| spark-submit<span class="w"> </span>--files<span class="w"> </span>pyspark_pex_env.pex<span class="w"> </span>app.py |
| </pre></div> |
| </div> |
| <p>For regular Python shells or notebooks:</p> |
| <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">os</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="kn">from</span> <span class="nn">app</span> <span class="kn">import</span> <span class="n">main</span> |
| |
| <span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s1">'PYSPARK_PYTHON'</span><span class="p">]</span> <span class="o">=</span> <span class="s2">"./pyspark_pex_env.pex"</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">config</span><span class="p">(</span> |
| <span class="s2">"spark.files"</span><span class="p">,</span> <span class="c1"># 'spark.yarn.dist.files' in YARN.</span> |
| <span class="s2">"pyspark_pex_env.pex"</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="n">main</span><span class="p">(</span><span class="n">spark</span><span class="p">)</span> |
| </pre></div> |
| </div> |
| <p>For the interactive pyspark shell, the commands are almost the same:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_DRIVER_PYTHON</span><span class="o">=</span>python |
| <span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_PYTHON</span><span class="o">=</span>./pyspark_pex_env.pex |
| pyspark<span class="w"> </span>--files<span class="w"> </span>pyspark_pex_env.pex |
| </pre></div> |
| </div> |
| <p>An end-to-end Docker example for deploying a standalone PySpark with <code class="docutils literal notranslate"><span class="pre">SparkSession.builder</span></code> and PEX |
| can be found <a class="reference external" href="https://github.com/criteo/cluster-pack/blob/master/examples/spark-with-S3/README.md">here</a> |
| - it uses cluster-pack, a library on top of PEX that automates the intermediate step of having |
| to create & upload the PEX manually.</p> |
| </section> |
| </section> |
| |
| |
| </article> |
| |
| |
| |
| <footer class="bd-footer-article"> |
| |
| <div class="footer-article-items footer-article__inner"> |
| |
| <div class="footer-article-item"><!-- Previous / next buttons --> |
| <div class="prev-next-area"> |
| <a class="left-prev" |
| href="index.html" |
| title="previous page"> |
| <i class="fa-solid fa-angle-left"></i> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">previous</p> |
| <p class="prev-next-title">User Guides</p> |
| </div> |
| </a> |
| <a class="right-next" |
| href="sql/index.html" |
| title="next page"> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">next</p> |
| <p class="prev-next-title">Spark SQL</p> |
| </div> |
| <i class="fa-solid fa-angle-right"></i> |
| </a> |
| </div></div> |
| |
| </div> |
| |
| </footer> |
| |
| </div> |
| |
| |
| |
| <div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner"> |
| |
| <div class="sidebar-secondary-item"> |
| <div class="page-toc tocsection onthispage"> |
| <i class="fa-solid fa-list"></i> On this page |
| </div> |
| <nav class="bd-toc-nav page-toc"> |
| <ul class="visible nav section-nav flex-column"> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#using-pyspark-native-features">Using PySpark Native Features</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#using-conda">Using Conda</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#using-virtualenv">Using Virtualenv</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#using-pex">Using PEX</a></li> |
| </ul> |
| </nav></div> |
| |
| <div class="sidebar-secondary-item"> |
| <div class="tocsection sourcelink"> |
| <a href="../_sources/user_guide/python_packaging.rst.txt"> |
| <i class="fa-solid fa-file-lines"></i> Show Source |
| </a> |
| </div> |
| </div> |
| |
| </div></div> |
| |
| |
| </div> |
| <footer class="bd-footer-content"> |
| |
| </footer> |
| |
| </main> |
| </div> |
| </div> |
| |
| <!-- Scripts loaded after <body> so the DOM is not blocked --> |
| <script src="../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script> |
| <script src="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script> |
| |
| <footer class="bd-footer"> |
| <div class="bd-footer__inner bd-page-width"> |
| |
| <div class="footer-items__start"> |
| |
| <div class="footer-item"><p class="copyright"> |
| Copyright © 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. |
| </p></div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0. |
| <br/> |
| </p> |
| </div> |
| |
| </div> |
| |
| |
| <div class="footer-items__end"> |
| |
| <div class="footer-item"><p class="theme-version"> |
| Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3. |
| </p></div> |
| |
| </div> |
| |
| </div> |
| |
| </footer> |
| </body> |
| </html> |