# Installation
PySpark is included in the official releases of Spark available at the [Apache Spark website](https://spark.apache.org/downloads.html).
For Python users, PySpark also provides `pip` installation from PyPI. This is usually for local usage or for use as
a client to connect to a cluster, rather than for setting up a cluster itself.

This page includes instructions for installing PySpark using pip, Conda, manual download, and building from source.
<div class="section" id="python-versions-supported">
<h2>Python Versions Supported<a class="headerlink" href="#python-versions-supported" title="Permalink to this headline"></a></h2>
<p>Python 3.8 and above.</p>
</div>
<div class="section" id="using-pypi">
<h2>Using PyPI<a class="headerlink" href="#using-pypi" title="Permalink to this headline"></a></h2>
<p>PySpark installation using <a class="reference external" href="https://pypi.org/project/pyspark/">PyPI</a> is as follows:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>pyspark
</pre></div>
</div>
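
After installation, you can verify that PySpark works by starting a local session and running a trivial job. A minimal smoke test (the app name is arbitrary):

```python
# Minimal smoke test: start a local Spark session and run a trivial job.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("install-check").getOrCreate()
df = spark.createDataFrame([(1, "spark"), (2, "pyspark")], ["id", "name"])
df.show()
print(spark.version)  # e.g. 3.5.4
spark.stop()
```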
If you want to install extra dependencies for a specific component, you can install them as shown below:

```bash
# Spark SQL
pip install pyspark[sql]
# pandas API on Spark
pip install pyspark[pandas_on_spark] plotly  # to plot your data, you can install plotly together with it.
# Spark Connect
pip install pyspark[connect]
```
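
For example, with the `pandas_on_spark` extra installed (which pulls in pandas and pyarrow), the pandas API on Spark becomes importable. A minimal sketch:

```python
# Minimal use of the pandas API on Spark; requires the dependencies
# installed by the pandas_on_spark extra (pandas, pyarrow, numpy).
import pyspark.pandas as ps

psdf = ps.DataFrame({"x": [1, 2, 3], "y": [4.0, 5.0, 6.0]})
print(psdf.mean())
```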
For PySpark with or without a specific Hadoop version, you can install it by using the `PYSPARK_HADOOP_VERSION` environment variable as shown below:

```bash
PYSPARK_HADOOP_VERSION=3 pip install pyspark
```
The default distribution uses Hadoop 3.3 and Hive 2.3. If you specify a different version of Hadoop, the pip installation automatically
downloads it and uses it in PySpark. Downloading it can take a while depending on
the network and the mirror chosen. `PYSPARK_RELEASE_MIRROR` can be set to manually choose the mirror for faster downloading.

```bash
PYSPARK_RELEASE_MIRROR=http://mirror.apache-kr.org PYSPARK_HADOOP_VERSION=3 pip install pyspark
```
It is recommended to use the `-v` option with `pip` to track the installation and download status.

```bash
PYSPARK_HADOOP_VERSION=3 pip install pyspark -v
```
Supported values for `PYSPARK_HADOOP_VERSION` are:

- `without`: Spark pre-built with user-provided Apache Hadoop
- `3`: Spark pre-built for Apache Hadoop 3.3 and later (default)

Note that this installation of PySpark with or without a specific Hadoop version is experimental. It can change or be removed between minor releases.
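
To confirm which Hadoop version an installed distribution was built against, you can ask the JVM from a running session. A diagnostic sketch; note that `sparkContext._jvm` is an internal gateway, not public API:

```python
# Print the Hadoop version bundled with (or provided to) this PySpark install.
# Note: sparkContext._jvm is internal API; use it for diagnostics only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
print(spark.sparkContext._jvm.org.apache.hadoop.util.VersionInfo.getVersion())
spark.stop()
```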
## Using Conda
Conda is an open-source package management and environment management system (developed by
[Anaconda](https://www.anaconda.com/)), which is best installed through
[Miniconda](https://docs.conda.io/en/latest/miniconda.html) or [Miniforge](https://github.com/conda-forge/miniforge/).
The tool is both cross-platform and language agnostic, and in practice conda can replace both
[pip](https://pip.pypa.io/en/latest/) and [virtualenv](https://virtualenv.pypa.io/en/latest/).

Conda uses so-called channels to distribute packages. Besides the default channels provided by
Anaconda itself, the most important channel is [conda-forge](https://conda-forge.org/), a
community-driven packaging effort that is the most extensive and most current (and which also
serves as the upstream for the Anaconda channels in most cases).
To create a new conda environment from your terminal and activate it, proceed as shown below:

```bash
conda create -n pyspark_env
conda activate pyspark_env
```
After activating the environment, use the following command to install pyspark,
a Python version of your choice, and any other packages you want to use in
the same session as pyspark (you can also install in several steps):

```bash
conda install -c conda-forge pyspark  # can also add "python=3.8 some_package [etc.]" here
```
Note that [PySpark for conda](https://anaconda.org/conda-forge/pyspark) is maintained
separately by the community; while new versions generally get packaged quickly, their
availability through conda(-forge) is not directly in sync with the PySpark release cycle.

While using pip in a conda environment is technically feasible (with the same command as
[above](#using-pypi)), this approach is [discouraged](https://www.anaconda.com/blog/using-pip-in-a-conda-environment/),
because pip does not interoperate with conda.

For a short summary of useful conda commands, see their
[cheat sheet](https://docs.conda.io/projects/conda/en/latest/user-guide/cheatsheet.html).
## Manually Downloading
PySpark is included in the distributions available at the [Apache Spark website](https://spark.apache.org/downloads.html).
You can download the distribution you want from the site. After that, uncompress the tar file into the directory where you want
to install Spark, for example, as below:

```bash
tar xzvf spark-3.5.4-bin-hadoop3.tgz
```
Ensure the `SPARK_HOME` environment variable points to the directory where the tar file has been extracted.
Update the `PYTHONPATH` environment variable so that it can find PySpark and Py4J under `SPARK_HOME/python/lib`.
One example of doing this is shown below:

```bash
cd spark-3.5.4-bin-hadoop3
export SPARK_HOME=`pwd`
export PYTHONPATH=$(ZIPS=("$SPARK_HOME"/python/lib/*.zip); IFS=:; echo "${ZIPS[*]}"):$PYTHONPATH
```
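
With these variables exported, a quick check from the same shell confirms that Python resolves PySpark from the extracted distribution rather than, say, a pip-installed copy. A small sketch:

```python
# Confirm that PySpark is picked up from the manually extracted distribution.
import os
import pyspark

print(os.environ.get("SPARK_HOME"))  # the extracted spark-3.5.4-bin-hadoop3 directory
print(pyspark.__file__)              # should point into $SPARK_HOME/python/lib/pyspark.zip
print(pyspark.__version__)           # 3.5.4
```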
## Installing from Source

To install PySpark from source, refer to [Building Spark](https://spark.apache.org/docs/3.5.4/building-spark.html).
<div class="section" id="dependencies">
<h2>Dependencies<a class="headerlink" href="#dependencies" title="Permalink to this headline"></a></h2>
| Package | Supported version | Note |
|---|---|---|
| `py4j` | >=0.10.9.7 | Required |
| `pandas` | >=1.0.5 | Required for pandas API on Spark and Spark Connect; optional for Spark SQL |
| `pyarrow` | >=4.0.0,<13.0.0 | Required for pandas API on Spark and Spark Connect; optional for Spark SQL |
| `numpy` | >=1.15 | Required for pandas API on Spark and MLlib DataFrame-based API; optional for Spark SQL |
| `grpcio` | >=1.48,<1.57 | Required for Spark Connect |
| `grpcio-status` | >=1.48,<1.57 | Required for Spark Connect |
| `googleapis-common-protos` | ==1.56.4 | Required for Spark Connect |
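
When a component fails to import, comparing the installed versions against this table is a reasonable first step. A small sketch using the standard-library `importlib.metadata` (package names as published on PyPI):

```python
# Report the installed versions of PySpark's dependencies, if present.
from importlib.metadata import PackageNotFoundError, version

packages = [
    "py4j", "pandas", "pyarrow", "numpy",
    "grpcio", "grpcio-status", "googleapis-common-protos",
]
for pkg in packages:
    try:
        print(f"{pkg}: {version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
```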
Note that PySpark requires Java 8 (8u371 and later), 11, or 17, with `JAVA_HOME` properly set.
If using JDK 11, set `-Dio.netty.tryReflectionSetAccessible=true` for Arrow-related features, and refer
to [Downloading](https://spark.apache.org/docs/3.5.4/#downloading).
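
On JDK 11, one way to pass that flag is through the standard `extraJavaOptions` settings when building the session. A sketch; in `local` mode the driver setting is the relevant one, and if you launch through `spark-submit` directly, pass the flag via `--driver-java-options` instead:

```python
# Enable Arrow-related features on JDK 11 by passing the Netty flag to the JVM.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.extraJavaOptions",
            "-Dio.netty.tryReflectionSetAccessible=true")
    .config("spark.executor.extraJavaOptions",
            "-Dio.netty.tryReflectionSetAccessible=true")
    .getOrCreate()
)
print(spark.version)
spark.stop()
```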