<article class="bd-article" role="main">
<section id="python-package-management">
<h1>Python Package Management<a class="headerlink" href="#python-package-management" title="Permalink to this headline">#</a></h1>
<p>When you want to run your PySpark application on a cluster such as YARN, Kubernetes, etc., you need to make
sure that your code and all used libraries are available on the executors.</p>
<p>As an example, let’s say you may want to run the <a class="reference internal" href="sql/arrow_pandas.html#series-to-scalar"><span class="std std-ref">Pandas UDF examples</span></a>.
As it uses pyarrow as an underlying implementation we need to make sure to have pyarrow installed on each executor
on the cluster. Otherwise you may get errors such as <code class="docutils literal notranslate"><span class="pre">ModuleNotFoundError:</span> <span class="pre">No</span> <span class="pre">module</span> <span class="pre">named</span> <span class="pre">'pyarrow'</span></code>.</p>
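It can help to confirm where such a failure actually occurs before packaging anything. As a quick diagnostic (a minimal sketch; probing with an RDD `map` is just one way to do this), you can check whether a module is importable on the executors, as opposed to only on the driver:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()


def probe(_):
    # Runs on an executor; reports whether pyarrow is importable there.
    try:
        import pyarrow
        return pyarrow.__version__
    except ImportError:
        return "pyarrow is missing on this executor"


print(spark.sparkContext.parallelize([0], 1).map(probe).collect())
```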
Here is the script `app.py` from the previous example that will be executed on the cluster:

```python
import pandas as pd
from pyspark.sql.functions import pandas_udf
from pyspark.sql import SparkSession


def main(spark):
    df = spark.createDataFrame(
        [(1, 1.0), (1, 2.0), (2, 3.0), (2, 5.0), (2, 10.0)],
        ("id", "v"))

    @pandas_udf("double")
    def mean_udf(v: pd.Series) -> float:
        return v.mean()

    print(df.groupby("id").agg(mean_udf(df['v'])).collect())


if __name__ == "__main__":
    main(SparkSession.builder.getOrCreate())
```
There are multiple ways to manage Python dependencies in a cluster:

- Using PySpark Native Features
- Using Conda
- Using Virtualenv
- Using PEX
<section id="using-pyspark-native-features">
<h2>Using PySpark Native Features<a class="headerlink" href="#using-pyspark-native-features" title="Permalink to this headline">#</a></h2>
<p>PySpark allows to upload Python files (<code class="docutils literal notranslate"><span class="pre">.py</span></code>), zipped Python packages (<code class="docutils literal notranslate"><span class="pre">.zip</span></code>), and Egg files (<code class="docutils literal notranslate"><span class="pre">.egg</span></code>)
to the executors by one of the following:</p>
<ul class="simple">
<li><p>Setting the configuration setting <code class="docutils literal notranslate"><span class="pre">spark.submit.pyFiles</span></code></p></li>
<li><p>Setting <code class="docutils literal notranslate"><span class="pre">--py-files</span></code> option in Spark scripts</p></li>
<li><p>Directly calling <a class="reference internal" href="../reference/api/pyspark.SparkContext.addPyFile.html#pyspark.SparkContext.addPyFile" title="pyspark.SparkContext.addPyFile"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyspark.SparkContext.addPyFile()</span></code></a> in applications</p></li>
</ul>
<p>This is a straightforward method to ship additional custom Python code to the cluster. You can just add individual files or zip whole
packages and upload them. Using <a class="reference internal" href="../reference/api/pyspark.SparkContext.addPyFile.html#pyspark.SparkContext.addPyFile" title="pyspark.SparkContext.addPyFile"><code class="xref py py-meth docutils literal notranslate"><span class="pre">pyspark.SparkContext.addPyFile()</span></code></a> allows you to upload code even after having started your job.</p>
<p>However, it does not allow to add packages built as <a class="reference external" href="https://www.python.org/dev/peps/pep-0427/">Wheels</a> and therefore
does not allow to include dependencies with native code.</p>
</section>
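As a minimal sketch of the runtime path (the archive name `deps.zip` and the package `mypackage` inside it are hypothetical):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Ship a zipped package (a single .py or .egg file also works) to every executor.
# "deps.zip" is a hypothetical archive containing a top-level "mypackage" package.
spark.sparkContext.addPyFile("deps.zip")


def uses_dependency(_):
    # Runs on an executor; files added via addPyFile are on its sys.path.
    import mypackage  # hypothetical package shipped inside deps.zip
    return mypackage.__name__


print(spark.sparkContext.parallelize([0], 1).map(uses_dependency).collect())
```

Keep in mind that this covers pure-Python code only; for dependencies with native code such as PyArrow, use one of the environment-packaging approaches below.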
<section id="using-conda">
<h2>Using Conda<a class="headerlink" href="#using-conda" title="Permalink to this headline">#</a></h2>
<p><a class="reference external" href="https://docs.conda.io/en/latest/">Conda</a> is one of the most widely-used Python package management systems. PySpark users can directly
use a Conda environment to ship their third-party Python packages by leveraging
<a class="reference external" href="https://conda.github.io/conda-pack/spark.html">conda-pack</a> which is a command line tool creating
relocatable Conda environments.</p>
<p>The example below creates a Conda environment to use on both the driver and executor and packs
it into an archive file. This archive file captures the Conda environment for Python and stores
both Python interpreter and all its relevant dependencies.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda<span class="w"> </span>create<span class="w"> </span>-y<span class="w"> </span>-n<span class="w"> </span>pyspark_conda_env<span class="w"> </span>-c<span class="w"> </span>conda-forge<span class="w"> </span>pyarrow<span class="w"> </span>pandas<span class="w"> </span>conda-pack
conda<span class="w"> </span>activate<span class="w"> </span>pyspark_conda_env
conda<span class="w"> </span>pack<span class="w"> </span>-f<span class="w"> </span>-o<span class="w"> </span>pyspark_conda_env.tar.gz
</pre></div>
</div>
After that, you can ship the archive together with your scripts or code by using the `--archives` option or the `spark.archives` configuration (`spark.yarn.dist.archives` in YARN). Spark automatically unpacks the archive on the executors.

For a `spark-submit` script, you can use it as follows:
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_DRIVER_PYTHON</span><span class="o">=</span>python<span class="w"> </span><span class="c1"># Do not set in cluster modes.</span>
<span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_PYTHON</span><span class="o">=</span>./environment/bin/python
spark-submit<span class="w"> </span>--archives<span class="w"> </span>pyspark_conda_env.tar.gz#environment<span class="w"> </span>app.py
</pre></div>
</div>
Note that `PYSPARK_DRIVER_PYTHON` above should not be set for cluster modes in YARN or Kubernetes.

If you are in a regular Python shell or notebook, you can try it as shown below:
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">os</span>
<span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span>
<span class="kn">from</span> <span class="nn">app</span> <span class="kn">import</span> <span class="n">main</span>
<span class="n">os</span><span class="o">.</span><span class="n">environ</span><span class="p">[</span><span class="s1">&#39;PYSPARK_PYTHON&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="s2">&quot;./environment/bin/python&quot;</span>
<span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="o">.</span><span class="n">builder</span><span class="o">.</span><span class="n">config</span><span class="p">(</span>
<span class="s2">&quot;spark.archives&quot;</span><span class="p">,</span> <span class="c1"># &#39;spark.yarn.dist.archives&#39; in YARN.</span>
<span class="s2">&quot;pyspark_conda_env.tar.gz#environment&quot;</span><span class="p">)</span><span class="o">.</span><span class="n">getOrCreate</span><span class="p">()</span>
<span class="n">main</span><span class="p">(</span><span class="n">spark</span><span class="p">)</span>
</pre></div>
</div>
For a pyspark shell:

```bash
export PYSPARK_DRIVER_PYTHON=python
export PYSPARK_PYTHON=./environment/bin/python
pyspark --archives pyspark_conda_env.tar.gz#environment
```
<section id="using-virtualenv">
<h2>Using Virtualenv<a class="headerlink" href="#using-virtualenv" title="Permalink to this headline">#</a></h2>
<p><a class="reference external" href="https://virtualenv.pypa.io/en/latest/">Virtualenv</a> is a Python tool to create isolated Python environments.
Since Python 3.3, a subset of its features has been integrated into Python as a standard library under
the <a class="reference external" href="https://docs.python.org/3/library/venv.html">venv</a> module. PySpark users can use virtualenv to manage
Python dependencies in their clusters by using <a class="reference external" href="https://jcristharif.com/venv-pack/index.html">venv-pack</a>
in a similar way as conda-pack.</p>
<p>A virtual environment to use on both driver and executor can be created as demonstrated below.
It packs the current virtual environment to an archive file, and it contains both Python interpreter and the dependencies.
However, it requires all nodes in a cluster to have the same Python interpreter installed because
<a class="reference external" href="https://github.com/jcrist/venv-pack/issues/5">venv-pack packs Python interpreter as a symbolic link</a>.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>python<span class="w"> </span>-m<span class="w"> </span>venv<span class="w"> </span>pyspark_venv
<span class="nb">source</span><span class="w"> </span>pyspark_venv/bin/activate
pip<span class="w"> </span>install<span class="w"> </span>pyarrow<span class="w"> </span>pandas<span class="w"> </span>venv-pack
venv-pack<span class="w"> </span>-o<span class="w"> </span>pyspark_venv.tar.gz
</pre></div>
</div>
You can pass and unpack the archive file and enable the environment on the executors by using the `--archives` option or the `spark.archives` configuration (`spark.yarn.dist.archives` in YARN).

For `spark-submit`, you can use it by running the command below. Note that `PYSPARK_DRIVER_PYTHON` has to be unset in Kubernetes or YARN cluster modes.
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_DRIVER_PYTHON</span><span class="o">=</span>python<span class="w"> </span><span class="c1"># Do not set in cluster modes.</span>
<span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_PYTHON</span><span class="o">=</span>./environment/bin/python
spark-submit<span class="w"> </span>--archives<span class="w"> </span>pyspark_venv.tar.gz#environment<span class="w"> </span>app.py
</pre></div>
</div>
For regular Python shells or notebooks:

```python
import os
from pyspark.sql import SparkSession
from app import main

os.environ['PYSPARK_PYTHON'] = "./environment/bin/python"
spark = SparkSession.builder.config(
    "spark.archives",  # 'spark.yarn.dist.archives' in YARN.
    "pyspark_venv.tar.gz#environment").getOrCreate()
main(spark)
```
In the case of a pyspark shell:

```bash
export PYSPARK_DRIVER_PYTHON=python
export PYSPARK_PYTHON=./environment/bin/python
pyspark --archives pyspark_venv.tar.gz#environment
```
<section id="using-pex">
<h2>Using PEX<a class="headerlink" href="#using-pex" title="Permalink to this headline">#</a></h2>
<p>PySpark can also use <a class="reference external" href="https://github.com/pantsbuild/pex">PEX</a> to ship the Python packages
together. PEX is a tool that creates a self-contained Python environment. This is similar
to Conda or virtualenv, but a <code class="docutils literal notranslate"><span class="pre">.pex</span></code> file is executable by itself.</p>
<p>The following example creates a <code class="docutils literal notranslate"><span class="pre">.pex</span></code> file for the driver and executor to use.
The file contains the Python dependencies specified with the <code class="docutils literal notranslate"><span class="pre">pex</span></code> command.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>pyarrow<span class="w"> </span>pandas<span class="w"> </span>pex
pex<span class="w"> </span>pyspark<span class="w"> </span>pyarrow<span class="w"> </span>pandas<span class="w"> </span>-o<span class="w"> </span>pyspark_pex_env.pex
</pre></div>
</div>
This file behaves similarly to a regular Python interpreter.

```bash
./pyspark_pex_env.pex -c "import pandas; print(pandas.__version__)"
1.1.5
```
However, a `.pex` file does not include the Python interpreter itself, so all nodes in the cluster should have the same Python interpreter installed.

To transfer and use a `.pex` file in a cluster, ship it via the `spark.files` configuration (`spark.yarn.dist.files` in YARN) or the `--files` option, because `.pex` files are regular files rather than directories or archives.

For application submission, run the commands as shown below. Note that `PYSPARK_DRIVER_PYTHON` should not be set for cluster modes in YARN or Kubernetes.
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_DRIVER_PYTHON</span><span class="o">=</span>python<span class="w"> </span><span class="c1"># Do not set in cluster modes.</span>
<span class="nb">export</span><span class="w"> </span><span class="nv">PYSPARK_PYTHON</span><span class="o">=</span>./pyspark_pex_env.pex
spark-submit<span class="w"> </span>--files<span class="w"> </span>pyspark_pex_env.pex<span class="w"> </span>app.py
</pre></div>
</div>
For regular Python shells or notebooks:

```python
import os
from pyspark.sql import SparkSession
from app import main

os.environ['PYSPARK_PYTHON'] = "./pyspark_pex_env.pex"
spark = SparkSession.builder.config(
    "spark.files",  # 'spark.yarn.dist.files' in YARN.
    "pyspark_pex_env.pex").getOrCreate()
main(spark)
```
For the interactive pyspark shell, the commands are almost the same:

```bash
export PYSPARK_DRIVER_PYTHON=python
export PYSPARK_PYTHON=./pyspark_pex_env.pex
pyspark --files pyspark_pex_env.pex
```
An end-to-end Docker example for deploying standalone PySpark with `SparkSession.builder` and PEX can be found [here](https://github.com/criteo/cluster-pack/blob/master/examples/spark-with-S3/README.md). It uses cluster-pack, a library on top of PEX that automates the intermediate step of having to create and upload the PEX file manually.