blob: 17b555f70d66c3eabc10a8bad8a5e14feb1e9cbe [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" />
<title>Installation &#8212; PySpark 4.0.0-preview1 documentation</title>
<script data-cfasync="false">
// Copy the persisted display mode and theme from localStorage onto the root
// element's data-* attributes; fall back to "" (mode) and "light" (theme).
{
  const rootFlags = document.documentElement.dataset;
  rootFlags.mode = localStorage.getItem("mode") || "";
  rootFlags.theme = localStorage.getItem("theme") || "light";
}
</script>
<!-- Loaded before other Sphinx assets -->
<link href="../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" />
<link href="../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" />
<link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" />
<link rel="stylesheet" type="text/css" href="../_static/pygments.css" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css" />
<link rel="stylesheet" type="text/css" href="../_static/css/pyspark.css" />
<!-- Pre-loaded scripts that we'll load fully later -->
<link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=e353d410970836974a52" />
<link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" />
<script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/clipboard.min.js"></script>
<script src="../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script>DOCUMENTATION_OPTIONS.pagename = 'getting_started/install';</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/getting_started/install.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Quickstart: DataFrame" href="quickstart_df.html" />
<link rel="prev" title="Getting Started" href="index.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="None">
<!-- Matomo -->
<script type="text/javascript">
// Reuse an existing window._paq command queue if one is already defined; otherwise start an empty one.
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
// Queue the tracker commands; "disableCookies" is pushed ahead of "trackPageView".
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
// Queue the tracker endpoint and site id, then inject matomo.js asynchronously.
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
// The new <script> element is inserted immediately before the first <script> in the document.
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode="">
<a class="skip-link" href="#main-content">Skip to main content</a>
<input type="checkbox"
class="sidebar-toggle"
name="__primary"
id="__primary"/>
<label class="overlay overlay-primary" for="__primary"></label>
<input type="checkbox"
class="sidebar-toggle"
name="__secondary"
id="__secondary"/>
<label class="overlay overlay-secondary" for="__secondary"></label>
<div class="search-button__wrapper">
<div class="search-button__overlay"></div>
<div class="search-button__search-container">
<form class="bd-search d-flex align-items-center"
action="../search.html"
method="get">
<i class="fa-solid fa-magnifying-glass"></i>
<input type="search"
class="form-control"
name="q"
id="search-input"
placeholder="Search the docs ..."
aria-label="Search the docs ..."
autocomplete="off"
autocorrect="off"
autocapitalize="off"
spellcheck="false"/>
<span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span>
</form></div>
</div>
<nav class="bd-header navbar navbar-expand-lg bd-navbar">
<div class="bd-header__inner bd-page-width">
<label class="sidebar-toggle primary-toggle" for="__primary">
<span class="fa-solid fa-bars"></span>
</label>
<div class="navbar-header-items__start">
<div class="navbar-item">
<a class="navbar-brand logo" href="../index.html">
<img src="../_static/spark-logo-light.png" class="logo__image only-light" alt="Logo image"/>
<!-- The dark-theme logo is written by script, so it is only present in the page when JavaScript runs. -->
<script>document.write(`<img src="../_static/spark-logo-dark.png" class="logo__image only-dark" alt="Logo image"/>`);</script>
</a></div>
</div>
<div class="col-lg-9 navbar-header-items">
<div class="me-auto navbar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../index.html">
Overview
</a>
</li>
<li class="nav-item current active">
<a class="nav-link nav-internal" href="index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="navbar-header-items__end">
<div class="navbar-item navbar-persistent--container">
<script>
// The header search button is written by script, so it is only present in the page when JavaScript runs.
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Build the docs-homepage URL for one version-switcher entry by substituting
// entry.version for the "{version}" placeholder of the URL template.
// Only the first "{version}" occurrence is replaced.
function buildURL(entry) {
  const homepageTemplate = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja
  return homepageTemplate.replace("{version}", entry.version);
}
// Click handler for version-switcher links: check whether the current page
// exists in the selected docs version and, if so, go there instead of the
// homepage of the other docs version.
//
// The link's href is the other version's homepage, which ends in "index.html".
// The candidate URL is therefore resolved relative to that homepage, so the
// trailing "index.html" is replaced by the current page path. Plain string
// concatenation produced ".../index.htmlgetting_started/install.html", which
// can never exist, so the probe always failed.
//
// Returns false so that the link's default navigation is cancelled and the
// redirect is decided by the outcome of the HEAD probe.
function checkPageExistsAndRedirect(event) {
  const currentFilePath = "getting_started/install.html",
        // currentTarget is the link this handler is bound to, even when the
        // click lands on a node nested inside it
        otherDocsHomepage = event.currentTarget.getAttribute("href");
  // Standard relative-URL resolution against the homepage URL.
  const tryUrl = new URL(currentFilePath, otherDocsHomepage).href;
  $.ajax({
    type: 'HEAD',
    url: tryUrl,
    // if the page exists, go there
    success: function() {
      location.href = tryUrl;
    }
  }).fail(function() {
    // the probe failed: fall back to the other version's homepage
    location.href = otherDocsHomepage;
  });
  return false;
}
// Function to populate the version switcher
(function () {
  // Resolve the dropdown that sits next to THIS script. The switcher markup is
  // rendered more than once on the page with the same id, and an id lookup such
  // as $("#version_switcher") only ever matches the first copy, which would
  // leave the other copy empty and fill the first one repeatedly.
  // document.currentScript is only set while the script runs synchronously, so
  // it is captured here rather than inside the AJAX callback.
  const ownContainer = document.currentScript && document.currentScript.parentNode;
  const menu = (ownContainer && ownContainer.querySelector("#version_switcher"))
    || document.getElementById("version_switcher");
  if (!menu) {
    // nothing to populate
    return;
  }
  // get JSON config
  $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) {
    // create the nodes first (before AJAX calls) to ensure the order is
    // correct (for now, links will go to doc version homepage)
    $.each(data, function(index, entry) {
      // if no custom name specified (e.g., "latest"), use version string
      if (!("name" in entry)) {
        entry.name = entry.version;
      }
      // construct the appropriate URL, and add it to the dropdown
      entry.url = buildURL(entry);
      const node = document.createElement("a");
      node.setAttribute("class", "list-group-item list-group-item-action py-1");
      node.setAttribute("href", `${entry.url}`);
      node.textContent = `${entry.name}`;
      node.onclick = checkPageExistsAndRedirect;
      menu.appendChild(node);
    });
  });
})();
</script></div>
<div class="navbar-item">
<script>
// The theme switch button (light / dark / auto icons) is written by script,
// so it is only present in the page when JavaScript runs.
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="navbar-persistent--mobile">
<script>
// The search button in the "navbar-persistent--mobile" container is written by
// script, so it is only present in the page when JavaScript runs.
document.write(`
<button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip">
<i class="fa-solid fa-magnifying-glass"></i>
</button>
`);
</script>
</div>
<label class="sidebar-toggle secondary-toggle" for="__secondary">
<span class="fa-solid fa-outdent"></span>
</label>
</div>
</nav>
<div class="bd-container">
<div class="bd-container__inner bd-page-width">
<div class="bd-sidebar-primary bd-sidebar">
<div class="sidebar-header-items sidebar-primary__section">
<div class="sidebar-header-items__center">
<div class="navbar-item"><nav class="navbar-nav">
<p class="sidebar-header-items__title"
role="heading"
aria-level="1"
aria-label="Site Navigation">
Site Navigation
</p>
<ul class="bd-navbar-elements navbar-nav">
<li class="nav-item">
<a class="nav-link nav-internal" href="../index.html">
Overview
</a>
</li>
<li class="nav-item current active">
<a class="nav-link nav-internal" href="index.html">
Getting Started
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../user_guide/index.html">
User Guides
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../reference/index.html">
API Reference
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../development/index.html">
Development
</a>
</li>
<li class="nav-item">
<a class="nav-link nav-internal" href="../migration_guide/index.html">
Migration Guides
</a>
</li>
</ul>
</nav></div>
</div>
<div class="sidebar-header-items__end">
<div class="navbar-item"><!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<div id="version-button" class="dropdown">
<button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown">
4.0.0-preview1
<span class="caret"></span>
</button>
<div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button">
<!-- dropdown will be populated by javascript on page load -->
</div>
</div>
<script type="text/javascript">
// Build the docs-homepage URL for one version-switcher entry by substituting
// entry.version for the "{version}" placeholder of the URL template.
// Only the first "{version}" occurrence is replaced.
function buildURL(entry) {
  const homepageTemplate = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja
  return homepageTemplate.replace("{version}", entry.version);
}
// Click handler for version-switcher links: check whether the current page
// exists in the selected docs version and, if so, go there instead of the
// homepage of the other docs version.
//
// The link's href is the other version's homepage, which ends in "index.html".
// The candidate URL is therefore resolved relative to that homepage, so the
// trailing "index.html" is replaced by the current page path. Plain string
// concatenation produced ".../index.htmlgetting_started/install.html", which
// can never exist, so the probe always failed.
//
// Returns false so that the link's default navigation is cancelled and the
// redirect is decided by the outcome of the HEAD probe.
function checkPageExistsAndRedirect(event) {
  const currentFilePath = "getting_started/install.html",
        // currentTarget is the link this handler is bound to, even when the
        // click lands on a node nested inside it
        otherDocsHomepage = event.currentTarget.getAttribute("href");
  // Standard relative-URL resolution against the homepage URL.
  const tryUrl = new URL(currentFilePath, otherDocsHomepage).href;
  $.ajax({
    type: 'HEAD',
    url: tryUrl,
    // if the page exists, go there
    success: function() {
      location.href = tryUrl;
    }
  }).fail(function() {
    // the probe failed: fall back to the other version's homepage
    location.href = otherDocsHomepage;
  });
  return false;
}
// Function to populate the version switcher
(function () {
  // Resolve the dropdown that sits next to THIS script. The switcher markup is
  // rendered more than once on the page with the same id, and an id lookup such
  // as $("#version_switcher") only ever matches the first copy, so this copy's
  // dropdown would never receive any entries.
  // document.currentScript is only set while the script runs synchronously, so
  // it is captured here rather than inside the AJAX callback.
  const ownContainer = document.currentScript && document.currentScript.parentNode;
  const menu = (ownContainer && ownContainer.querySelector("#version_switcher"))
    || document.getElementById("version_switcher");
  if (!menu) {
    // nothing to populate
    return;
  }
  // get JSON config
  $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) {
    // create the nodes first (before AJAX calls) to ensure the order is
    // correct (for now, links will go to doc version homepage)
    $.each(data, function(index, entry) {
      // if no custom name specified (e.g., "latest"), use version string
      if (!("name" in entry)) {
        entry.name = entry.version;
      }
      // construct the appropriate URL, and add it to the dropdown
      entry.url = buildURL(entry);
      const node = document.createElement("a");
      node.setAttribute("class", "list-group-item list-group-item-action py-1");
      node.setAttribute("href", `${entry.url}`);
      node.textContent = `${entry.name}`;
      node.onclick = checkPageExistsAndRedirect;
      menu.appendChild(node);
    });
  });
})();
</script></div>
<div class="navbar-item">
<script>
// The theme switch button (light / dark / auto icons) is written by script,
// so it is only present in the page when JavaScript runs.
document.write(`
<button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
<span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span>
<span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span>
<span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span>
</button>
`);
</script></div>
<div class="navbar-item"><ul class="navbar-icon-links navbar-nav"
aria-label="Icon Links">
<li class="nav-item">
<a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span>
<label class="sr-only">GitHub</label></a>
</li>
<li class="nav-item">
<a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span>
<label class="sr-only">PyPI</label></a>
</li>
</ul></div>
</div>
</div>
<div class="sidebar-primary-items__start sidebar-primary__section">
<div class="sidebar-primary-item"><nav class="bd-docs-nav bd-links"
aria-label="Section Navigation">
<p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p>
<div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav">
<li class="toctree-l1 current active"><a class="current reference internal" href="#">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="quickstart_df.html">Quickstart: DataFrame</a></li>
<li class="toctree-l1"><a class="reference internal" href="quickstart_connect.html">Quickstart: Spark Connect</a></li>
<li class="toctree-l1"><a class="reference internal" href="quickstart_ps.html">Quickstart: Pandas API on Spark</a></li>
<li class="toctree-l1"><a class="reference internal" href="testing_pyspark.html">Testing PySpark</a></li>
</ul>
</div>
</nav></div>
</div>
<div class="sidebar-primary-items__end sidebar-primary__section">
</div>
<div id="rtd-footer-container"></div>
</div>
<main id="main-content" class="bd-main">
<div class="bd-content">
<div class="bd-article-container">
<div class="bd-header-article">
<div class="header-article-items header-article__inner">
<div class="header-article-items__start">
<div class="header-article-item">
<nav aria-label="Breadcrumbs">
<ul class="bd-breadcrumbs">
<li class="breadcrumb-item breadcrumb-home">
<a href="../index.html" class="nav-link" aria-label="Home">
<i class="fa-solid fa-home"></i>
</a>
</li>
<li class="breadcrumb-item"><a href="index.html" class="nav-link">Getting Started</a></li>
<li class="breadcrumb-item active" aria-current="page">Installation</li>
</ul>
</nav>
</div>
</div>
</div>
</div>
<div id="searchbox"></div>
<article class="bd-article" role="main">
<section id="installation">
<h1>Installation<a class="headerlink" href="#installation" title="Permalink to this headline">#</a></h1>
<p>PySpark is included in the official releases of Spark available in the <a class="reference external" href="https://spark.apache.org/downloads.html">Apache Spark website</a>.
For Python users, PySpark also provides <code class="docutils literal notranslate"><span class="pre">pip</span></code> installation from PyPI. This is usually for local usage or as
a client to connect to a cluster instead of setting up a cluster itself.</p>
<p>This page includes instructions for installing PySpark by using pip, Conda, downloading manually,
and building from the source.</p>
<section id="python-versions-supported">
<h2>Python Versions Supported<a class="headerlink" href="#python-versions-supported" title="Permalink to this headline">#</a></h2>
<p>Python 3.9 and above.</p>
</section>
<section id="using-pypi">
<h2>Using PyPI<a class="headerlink" href="#using-pypi" title="Permalink to this headline">#</a></h2>
<p>PySpark installation using <a class="reference external" href="https://pypi.org/project/pyspark/">PyPI (pyspark)</a> is as follows:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>pyspark
</pre></div>
</div>
<p>If you want to install extra dependencies for a specific component, you can install it as below:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Spark SQL</span>
pip<span class="w"> </span>install<span class="w"> </span>pyspark<span class="o">[</span>sql<span class="o">]</span>
<span class="c1"># pandas API on Spark</span>
pip<span class="w"> </span>install<span class="w"> </span>pyspark<span class="o">[</span>pandas_on_spark<span class="o">]</span><span class="w"> </span>plotly<span class="w"> </span><span class="c1"># to plot your data, you can install plotly together.</span>
<span class="c1"># Spark Connect</span>
pip<span class="w"> </span>install<span class="w"> </span>pyspark<span class="o">[</span>connect<span class="o">]</span>
</pre></div>
</div>
<p>See <a class="reference internal" href="#optional-dependencies"><span class="std std-ref">Optional dependencies</span></a> for more detail about extra dependencies.</p>
<p>For PySpark with/without a specific Hadoop version, you can install it by using the <code class="docutils literal notranslate"><span class="pre">PYSPARK_HADOOP_VERSION</span></code> environment variable as below:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">3</span><span class="w"> </span>pip<span class="w"> </span>install<span class="w"> </span>pyspark
</pre></div>
</div>
<p>The default distribution uses Hadoop 3.3 and Hive 2.3. If users specify different versions of Hadoop, the pip installation automatically
downloads a different version and uses it in PySpark. Downloading it can take a while depending on
the network and the mirror chosen. <code class="docutils literal notranslate"><span class="pre">PYSPARK_RELEASE_MIRROR</span></code> can be set to manually choose the mirror for faster downloading.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_RELEASE_MIRROR</span><span class="o">=</span>http://mirror.apache-kr.org<span class="w"> </span><span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">3</span><span class="w"> </span>pip<span class="w"> </span>install<span class="w"> </span>pyspark
</pre></div>
</div>
<p>It is recommended to use the <code class="docutils literal notranslate"><span class="pre">-v</span></code> option in <code class="docutils literal notranslate"><span class="pre">pip</span></code> to track the installation and download status.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">3</span><span class="w"> </span>pip<span class="w"> </span>install<span class="w"> </span>pyspark<span class="w"> </span>-v
</pre></div>
</div>
<p>Supported values in <code class="docutils literal notranslate"><span class="pre">PYSPARK_HADOOP_VERSION</span></code> are:</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">without</span></code>: Spark pre-built with user-provided Apache Hadoop</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">3</span></code>: Spark pre-built for Apache Hadoop 3.3 and later (default)</p></li>
</ul>
<p>Note that this installation of PySpark with/without a specific Hadoop version is experimental. It can change or be removed between minor releases.</p>
<section id="python-spark-connect-client">
<h3>Python Spark Connect Client<a class="headerlink" href="#python-spark-connect-client" title="Permalink to this headline">#</a></h3>
<p>The Python Spark Connect client is a pure Python library that does not rely on any non-Python dependencies such as jars and JRE in your environment.
To install the Python Spark Connect client via <a class="reference external" href="https://pypi.org/project/pyspark-connect/">PyPI (pyspark-connect)</a>, execute the following command:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>pyspark-connect
</pre></div>
</div>
<p>See also <a class="reference external" href="quickstart_connect.html">Quickstart: Spark Connect</a> for how to use it.</p>
</section>
</section>
<section id="using-conda">
<h2>Using Conda<a class="headerlink" href="#using-conda" title="Permalink to this headline">#</a></h2>
<p>Conda is an open-source package management and environment management system (developed by
<a class="reference external" href="https://www.anaconda.com/">Anaconda</a>), which is best installed through
<a class="reference external" href="https://docs.conda.io/en/latest/miniconda.html">Miniconda</a> or <a class="reference external" href="https://github.com/conda-forge/miniforge/">Miniforge</a>.
The tool is both cross-platform and language agnostic, and in practice, conda can replace both
<a class="reference external" href="https://pip.pypa.io/en/latest/">pip</a> and <a class="reference external" href="https://virtualenv.pypa.io/en/latest/">virtualenv</a>.</p>
<p>Conda uses so-called channels to distribute packages, and together with the default channels by
Anaconda itself, the most important channel is <a class="reference external" href="https://conda-forge.org/">conda-forge</a>, which
is the community-driven packaging effort that is the most extensive &amp; the most current (and also
serves as the upstream for the Anaconda channels in most cases).</p>
<p>To create a new conda environment from your terminal and activate it, proceed as shown below:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda<span class="w"> </span>create<span class="w"> </span>-n<span class="w"> </span>pyspark_env
conda<span class="w"> </span>activate<span class="w"> </span>pyspark_env
</pre></div>
</div>
<p>After activating the environment, use the following command to install pyspark,
a python version of your choice, as well as other packages you want to use in
the same session as pyspark (you can install in several steps too).</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda<span class="w"> </span>install<span class="w"> </span>-c<span class="w"> </span>conda-forge<span class="w"> </span>pyspark<span class="w"> </span><span class="c1"># can also add &quot;python=3.9 some_package [etc.]&quot; here</span>
</pre></div>
</div>
<p>Note that <a class="reference external" href="https://anaconda.org/conda-forge/pyspark">PySpark for conda</a> is maintained
separately by the community; while new versions generally get packaged quickly, the
availability through conda(-forge) is not directly in sync with the PySpark release cycle.</p>
<p>While using pip in a conda environment is technically feasible (with the same command as
<a class="reference internal" href="#using-pypi"><span class="std std-ref">above</span></a>), this approach is <a class="reference external" href="https://www.anaconda.com/blog/using-pip-in-a-conda-environment/">discouraged</a>,
because pip does not interoperate with conda.</p>
<p>For a short summary about useful conda commands, see their
<a class="reference external" href="https://docs.conda.io/projects/conda/en/latest/user-guide/cheatsheet.html">cheat sheet</a>.</p>
</section>
<section id="manually-downloading">
<h2>Manually Downloading<a class="headerlink" href="#manually-downloading" title="Permalink to this headline">#</a></h2>
<p>PySpark is included in the distributions available at the <a class="reference external" href="https://spark.apache.org/downloads.html">Apache Spark website</a>.
You can download a distribution you want from the site. After that, uncompress the tar file into the directory where you want
to install Spark, for example, as below:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>tar<span class="w"> </span>xzvf<span class="w"> </span>spark-4.0.0-preview1-bin-hadoop3.tgz
</pre></div>
</div>
<p>Ensure the <code class="docutils literal notranslate"><span class="pre">SPARK_HOME</span></code> environment variable points to the directory where the tar file has been extracted.
Update the <code class="docutils literal notranslate"><span class="pre">PYTHONPATH</span></code> environment variable so that it can find PySpark and Py4J under <code class="docutils literal notranslate"><span class="pre">SPARK_HOME/python/lib</span></code>.
One example of doing this is shown below:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">cd</span><span class="w"> </span>spark-4.0.0-preview1-bin-hadoop3
<span class="nb">export</span><span class="w"> </span><span class="nv">SPARK_HOME</span><span class="o">=</span><span class="sb">`</span><span class="nb">pwd</span><span class="sb">`</span>
<span class="nb">export</span><span class="w"> </span><span class="nv">PYTHONPATH</span><span class="o">=</span><span class="k">$(</span><span class="nv">ZIPS</span><span class="o">=(</span><span class="s2">&quot;</span><span class="nv">$SPARK_HOME</span><span class="s2">&quot;</span>/python/lib/*.zip<span class="k">)</span><span class="p">;</span><span class="w"> </span><span class="nv">IFS</span><span class="o">=</span>:<span class="p">;</span><span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">&quot;</span><span class="si">${</span><span class="nv">ZIPS</span><span class="p">[*]</span><span class="si">}</span><span class="s2">&quot;</span><span class="o">)</span>:<span class="nv">$PYTHONPATH</span>
</pre></div>
</div>
</section>
<section id="installing-from-source">
<h2>Installing from Source<a class="headerlink" href="#installing-from-source" title="Permalink to this headline">#</a></h2>
<p>To install PySpark from source, refer to <a class="reference external" href="https://spark.apache.org/docs/4.0.0-preview1/building-spark.html">Building Spark</a>.</p>
</section>
<section id="dependencies">
<h2>Dependencies<a class="headerlink" href="#dependencies" title="Permalink to this headline">#</a></h2>
<section id="required-dependencies">
<h3>Required dependencies<a class="headerlink" href="#required-dependencies" title="Permalink to this headline">#</a></h3>
<p>PySpark requires the following dependencies.</p>
<table class="table">
<colgroup>
<col style="width: 33%" />
<col style="width: 31%" />
<col style="width: 36%" />
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Package</p></th>
<th class="head"><p>Supported version</p></th>
<th class="head"><p>Note</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><cite>py4j</cite></p></td>
<td><p>&gt;=0.10.9.7</p></td>
<td><p>Required to interact with JVM</p></td>
</tr>
</tbody>
</table>
<p>Additional libraries that enhance functionality but are not included in the installation packages:</p>
<ul class="simple">
<li><p><strong>memory-profiler</strong>: Used for PySpark UDF memory profiling, <code class="docutils literal notranslate"><span class="pre">spark.profile.show(...)</span></code> and <code class="docutils literal notranslate"><span class="pre">spark.sql.pyspark.udf.profiler</span></code>.</p></li>
</ul>
<p>Note that PySpark requires Java 17 or later with <code class="docutils literal notranslate"><span class="pre">JAVA_HOME</span></code> properly set; refer to <a class="reference external" href="https://spark.apache.org/docs/4.0.0-preview1/#downloading">Downloading</a> for details.</p>
</section>
<section id="optional-dependencies">
<span id="id2"></span><h3>Optional dependencies<a class="headerlink" href="#optional-dependencies" title="Permalink to this headline">#</a></h3>
<p>PySpark has several optional dependencies that enhance its functionality for specific modules.
These dependencies are only required for certain features and are not necessary for the basic functionality of PySpark.
If these optional dependencies are not installed, PySpark will function correctly for basic operations but will raise an <code class="docutils literal notranslate"><span class="pre">ImportError</span></code>
when you try to use features that require these dependencies.</p>
<section id="spark-connect">
<h4>Spark Connect<a class="headerlink" href="#spark-connect" title="Permalink to this headline">#</a></h4>
<p>Installable with <code class="docutils literal notranslate"><span class="pre">pip</span> <span class="pre">install</span> <span class="pre">&quot;pyspark[connect]&quot;</span></code>.</p>
<table class="table">
<colgroup>
<col style="width: 38%" />
<col style="width: 25%" />
<col style="width: 38%" />
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Package</p></th>
<th class="head"><p>Supported version</p></th>
<th class="head"><p>Note</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><cite>pandas</cite></p></td>
<td><p>&gt;=2.0.0</p></td>
<td><p>Required for Spark Connect</p></td>
</tr>
<tr class="row-odd"><td><p><cite>pyarrow</cite></p></td>
<td><p>&gt;=10.0.0</p></td>
<td><p>Required for Spark Connect</p></td>
</tr>
<tr class="row-even"><td><p><cite>grpcio</cite></p></td>
<td><p>&gt;=1.62.0</p></td>
<td><p>Required for Spark Connect</p></td>
</tr>
<tr class="row-odd"><td><p><cite>grpcio-status</cite></p></td>
<td><p>&gt;=1.62.0</p></td>
<td><p>Required for Spark Connect</p></td>
</tr>
<tr class="row-even"><td><p><cite>googleapis-common-protos</cite></p></td>
<td><p>&gt;=1.56.4</p></td>
<td><p>Required for Spark Connect</p></td>
</tr>
</tbody>
</table>
</section>
<section id="spark-sql">
<h4>Spark SQL<a class="headerlink" href="#spark-sql" title="Permalink to this headline">#</a></h4>
<p>Installable with <code class="docutils literal notranslate"><span class="pre">pip</span> <span class="pre">install</span> <span class="pre">&quot;pyspark[sql]&quot;</span></code>.</p>
<table class="table">
<colgroup>
<col style="width: 19%" />
<col style="width: 35%" />
<col style="width: 46%" />
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Package</p></th>
<th class="head"><p>Supported version</p></th>
<th class="head"><p>Note</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><cite>pandas</cite></p></td>
<td><p>&gt;=2.0.0</p></td>
<td><p>Required for Spark SQL</p></td>
</tr>
<tr class="row-odd"><td><p><cite>pyarrow</cite></p></td>
<td><p>&gt;=10.0.0</p></td>
<td><p>Required for Spark SQL</p></td>
</tr>
</tbody>
</table>
</section>
<section id="pandas-api-on-spark">
<h4>Pandas API on Spark<a class="headerlink" href="#pandas-api-on-spark" title="Permalink to this headline">#</a></h4>
<p>Installable with <code class="docutils literal notranslate"><span class="pre">pip</span> <span class="pre">install</span> <span class="pre">&quot;pyspark[pandas_on_spark]&quot;</span></code>.</p>
<table class="table">
<colgroup>
<col style="width: 16%" />
<col style="width: 29%" />
<col style="width: 55%" />
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Package</p></th>
<th class="head"><p>Supported version</p></th>
<th class="head"><p>Note</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><cite>pandas</cite></p></td>
<td><p>&gt;=2.0.0</p></td>
<td><p>Required for Pandas API on Spark</p></td>
</tr>
<tr class="row-odd"><td><p><cite>pyarrow</cite></p></td>
<td><p>&gt;=10.0.0</p></td>
<td><p>Required for Pandas API on Spark</p></td>
</tr>
</tbody>
</table>
<p>Additional libraries that enhance functionality but are not included in the installation packages:</p>
<ul class="simple">
<li><p><strong>mlflow</strong>: Required for <code class="docutils literal notranslate"><span class="pre">pyspark.pandas.mlflow</span></code>.</p></li>
<li><p><strong>plotly</strong>: Provides plotting for visualization. Using <strong>plotly</strong> is recommended over <strong>matplotlib</strong>.</p></li>
<li><p><strong>matplotlib</strong>: Provides plotting for visualization. The default plotting backend is <strong>plotly</strong>.</p></li>
</ul>
</section>
<section id="mllib-dataframe-based-api">
<h4>MLlib DataFrame-based API<a class="headerlink" href="#mllib-dataframe-based-api" title="Permalink to this headline">#</a></h4>
<p>Installable with <code class="docutils literal notranslate"><span class="pre">pip</span> <span class="pre">install</span> <span class="pre">&quot;pyspark[ml]&quot;</span></code>.</p>
<table class="table">
<colgroup>
<col style="width: 11%" />
<col style="width: 27%" />
<col style="width: 61%" />
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Package</p></th>
<th class="head"><p>Supported version</p></th>
<th class="head"><p>Note</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><cite>numpy</cite></p></td>
<td><p>&gt;=1.21</p></td>
<td><p>Required for MLlib DataFrame-based API</p></td>
</tr>
</tbody>
</table>
<p>Additional libraries that enhance functionality but are not included in the installation packages:</p>
<ul class="simple">
<li><p><strong>scipy</strong>: Required for SciPy integration.</p></li>
<li><p><strong>scikit-learn</strong>: Required for implementing machine learning algorithms.</p></li>
<li><p><strong>torch</strong>: Required for machine learning model training.</p></li>
<li><p><strong>torchvision</strong>: Required for supporting image and video processing.</p></li>
<li><p><strong>torcheval</strong>: Required for facilitating model evaluation metrics.</p></li>
<li><p><strong>deepspeed</strong>: Required for providing high-performance model training optimizations. Installable on non-Darwin systems.</p></li>
</ul>
</section>
<section id="mllib">
<h4>MLlib<a class="headerlink" href="#mllib" title="Permalink to this headline">#</a></h4>
<p>Installable with <code class="docutils literal notranslate"><span class="pre">pip</span> <span class="pre">install</span> <span class="pre">&quot;pyspark[mllib]&quot;</span></code>.</p>
<table class="table">
<colgroup>
<col style="width: 17%" />
<col style="width: 40%" />
<col style="width: 43%" />
</colgroup>
<thead>
<tr class="row-odd"><th class="head"><p>Package</p></th>
<th class="head"><p>Supported version</p></th>
<th class="head"><p>Note</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><cite>numpy</cite></p></td>
<td><p>&gt;=1.21</p></td>
<td><p>Required for MLlib</p></td>
</tr>
</tbody>
</table>
</section>
</section>
</section>
</section>
</article>
<footer class="bd-footer-article">
<div class="footer-article-items footer-article__inner">
<div class="footer-article-item"><!-- Previous / next buttons -->
<div class="prev-next-area">
<a class="left-prev"
href="index.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Getting Started</p>
</div>
</a>
<a class="right-next"
href="quickstart_df.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Quickstart: DataFrame</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div></div>
</div>
</footer>
</div>
<div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">
<div class="sidebar-secondary-item">
<div class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> On this page
</div>
<nav class="bd-toc-nav page-toc">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#python-versions-supported">Python Versions Supported</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#using-pypi">Using PyPI</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#python-spark-connect-client">Python Spark Connect Client</a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#using-conda">Using Conda</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#manually-downloading">Manually Downloading</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#installing-from-source">Installing from Source</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#dependencies">Dependencies</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#required-dependencies">Required dependencies</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#optional-dependencies">Optional dependencies</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#spark-connect">Spark Connect</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#spark-sql">Spark SQL</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#pandas-api-on-spark">Pandas API on Spark</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#mllib-dataframe-based-api">MLlib DataFrame-based API</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#mllib">MLlib</a></li>
</ul>
</li>
</ul>
</li>
</ul>
</nav></div>
<div class="sidebar-secondary-item">
<div class="tocsection sourcelink">
<a href="../_sources/getting_started/install.rst.txt">
<i class="fa-solid fa-file-lines"></i> Show Source
</a>
</div>
</div>
</div></div>
</div>
<footer class="bd-footer-content">
</footer>
</main>
</div>
</div>
<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script src="../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script>
<script src="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script>
<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">
<div class="footer-items__start">
<div class="footer-item"><p class="copyright">
Copyright © 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
</p></div>
<div class="footer-item">
<p class="sphinx-version">
Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0.
<br/>
</p>
</div>
</div>
<div class="footer-items__end">
<div class="footer-item"><p class="theme-version">
Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3.
</p></div>
</div>
</div>
</footer>
</body>
</html>