blob: 7351d0ec38adf2136fdf403445248e7aa4d4896f [file] [log] [blame]
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />
<title>Installation &#8212; PySpark 3.1.3 documentation</title>
<link rel="stylesheet" href="../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css">
<link rel="stylesheet"
href="../_static/vendor/fontawesome/5.13.0/css/all.min.css">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
<link rel="preload" as="font" type="font/woff2" crossorigin
href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">
<link rel="stylesheet"
href="../_static/vendor/open-sans_all/1.44.1/index.css">
<link rel="stylesheet"
href="../_static/vendor/lato_latin-ext/1.44.1/index.css">
<link rel="stylesheet" href="../_static/basic.css" type="text/css" />
<link rel="stylesheet" href="../_static/pygments.css" type="text/css" />
<link rel="stylesheet" type="text/css" href="../_static/css/pyspark.css" />
<link rel="preload" as="script" href="../_static/js/index.3da636dd464baa7582d2.js">
<script id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script>
<script src="../_static/jquery.js"></script>
<script src="../_static/underscore.js"></script>
<script src="../_static/doctools.js"></script>
<script src="../_static/language_data.js"></script>
<script src="../_static/copybutton.js"></script>
<script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script>
<script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script>
<link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/getting_started/install.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Quickstart" href="quickstart.html" />
<link rel="prev" title="Getting Started" href="index.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="en" />
</head>
<body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80">
<nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main">
<div class="container-xl">
<a class="navbar-brand" href="../index.html">
<img src="../_static/spark-logo-reverse.png" class="logo" alt="logo" />
</a>
<button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div id="navbar-menu" class="col-lg-9 collapse navbar-collapse">
<ul id="navbar-main-elements" class="navbar-nav mr-auto">
<li class="nav-item active">
<a class="nav-link" href="index.html">Getting Started</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="../user_guide/index.html">User Guide</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="../reference/index.html">API Reference</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="../development/index.html">Development</a>
</li>
<li class="nav-item ">
<a class="nav-link" href="../migration_guide/index.html">Migration Guide</a>
</li>
</ul>
<ul class="navbar-nav">
</ul>
</div>
</div>
</nav>
<div class="container-xl">
<div class="row">
<div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../search.html" method="get">
<i class="icon fas fa-search"></i>
<input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" >
</form>
<nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
<div class="bd-toc-item active">
<ul class="nav bd-sidenav">
<li class="active">
<a href="">Installation</a>
</li>
<li class="">
<a href="quickstart.html">Quickstart</a>
</li>
</ul>
</nav>
</div>
<div class="d-none d-xl-block col-xl-2 bd-toc">
<div class="tocsection onthispage pt-5 pb-3">
<i class="fas fa-list"></i> On this page
</div>
<nav id="bd-toc-nav">
<ul class="nav section-nav flex-column">
<li class="nav-item toc-entry toc-h2">
<a href="#python-version-supported" class="nav-link">Python Version Supported</a>
</li>
<li class="nav-item toc-entry toc-h2">
<a href="#using-pypi" class="nav-link">Using PyPI</a>
</li>
<li class="nav-item toc-entry toc-h2">
<a href="#using-conda" class="nav-link">Using Conda</a>
</li>
<li class="nav-item toc-entry toc-h2">
<a href="#manually-downloading" class="nav-link">Manually Downloading</a>
</li>
<li class="nav-item toc-entry toc-h2">
<a href="#installing-from-source" class="nav-link">Installing from Source</a>
</li>
<li class="nav-item toc-entry toc-h2">
<a href="#dependencies" class="nav-link">Dependencies</a>
</li>
</ul>
</nav>
</div>
<main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main">
<div>
<section id="installation">
<h1>Installation<a class="headerlink" href="#installation" title="Permalink to this headline"></a></h1>
<p>PySpark is included in the official releases of Spark available in the <a class="reference external" href="https://spark.apache.org/downloads.html">Apache Spark website</a>.
For Python users, PySpark also provides <code class="docutils literal notranslate"><span class="pre">pip</span></code> installation from PyPI. This is usually intended for local usage or as
a client to connect to an existing cluster, rather than for setting up a cluster itself.</p>
<p>This page includes instructions for installing PySpark by using pip or Conda, by downloading it manually,
and by building it from source.</p>
<section id="python-version-supported">
<h2>Python Version Supported<a class="headerlink" href="#python-version-supported" title="Permalink to this headline"></a></h2>
<p>Python 3.6 and above.</p>
</section>
<section id="using-pypi">
<h2>Using PyPI<a class="headerlink" href="#using-pypi" title="Permalink to this headline"></a></h2>
<p>PySpark installation using <a class="reference external" href="https://pypi.org/project/pyspark/">PyPI</a> is as follows:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip install pyspark
</pre></div>
</div>
<p>If you want to install extra dependencies for a specific component, you can install it as below:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip install pyspark<span class="o">[</span>sql<span class="o">]</span>
</pre></div>
</div>
<p>For PySpark with/without a specific Hadoop version, you can install it by using the <code class="docutils literal notranslate"><span class="pre">PYSPARK_HADOOP_VERSION</span></code> environment variable as below:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">2</span>.7 pip install pyspark
</pre></div>
</div>
<p>The default distribution uses Hadoop 3.2 and Hive 2.3. If users specify a different version of Hadoop, the pip installation automatically
downloads that version and uses it in PySpark. Downloading it can take a while depending on
the network and the mirror chosen. <code class="docutils literal notranslate"><span class="pre">PYSPARK_RELEASE_MIRROR</span></code> can be set to manually choose the mirror for faster downloading.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_RELEASE_MIRROR</span><span class="o">=</span>http://mirror.apache-kr.org <span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">2</span>.7 pip install pyspark
</pre></div>
</div>
<p>It is recommended to use the <code class="docutils literal notranslate"><span class="pre">-v</span></code> option in <code class="docutils literal notranslate"><span class="pre">pip</span></code> to track the installation and download status.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">2</span>.7 pip install pyspark -v
</pre></div>
</div>
<p>Supported values in <code class="docutils literal notranslate"><span class="pre">PYSPARK_HADOOP_VERSION</span></code> are:</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">without</span></code>: Spark pre-built with user-provided Apache Hadoop</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">2.7</span></code>: Spark pre-built for Apache Hadoop 2.7</p></li>
<li><p><code class="docutils literal notranslate"><span class="pre">3.2</span></code>: Spark pre-built for Apache Hadoop 3.2 and later (default)</p></li>
</ul>
<p>Note that this installation way of PySpark with/without a specific Hadoop version is experimental. It can change or be removed between minor releases.</p>
</section>
<section id="using-conda">
<h2>Using Conda<a class="headerlink" href="#using-conda" title="Permalink to this headline"></a></h2>
<p>Conda is an open-source package management and environment management system which is a part of
the <a class="reference external" href="https://docs.continuum.io/anaconda/">Anaconda</a> distribution. It is both cross-platform and
language agnostic. In practice, Conda can replace both <a class="reference external" href="https://pip.pypa.io/en/latest/">pip</a> and
<a class="reference external" href="https://virtualenv.pypa.io/en/latest/">virtualenv</a>.</p>
<p>Create a new virtual environment from your terminal as shown below:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda create -n pyspark_env
</pre></div>
</div>
<p>After the virtual environment is created, it should be visible under the list of Conda environments
which can be seen using the following command:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda env list
</pre></div>
</div>
<p>Now activate the newly created environment with the following command:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda activate pyspark_env
</pre></div>
</div>
<p>You can install PySpark in the newly created environment by following the <a class="reference external" href="#using-pypi">Using PyPI</a>
instructions, for example as below. It will install PySpark under the new virtual environment
<code class="docutils literal notranslate"><span class="pre">pyspark_env</span></code> created above.</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip install pyspark
</pre></div>
</div>
<p>Alternatively, you can install PySpark from Conda itself as below:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda install pyspark
</pre></div>
</div>
<p>However, note that <a class="reference external" href="https://anaconda.org/conda-forge/pyspark">PySpark at Conda</a> is not necessarily
synced with PySpark release cycle because it is maintained by the community separately.</p>
</section>
<section id="manually-downloading">
<h2>Manually Downloading<a class="headerlink" href="#manually-downloading" title="Permalink to this headline"></a></h2>
<p>PySpark is included in the distributions available at the <a class="reference external" href="https://spark.apache.org/downloads.html">Apache Spark website</a>.
You can download a distribution you want from the site. After that, uncompress the tar file into the directory where you want
to install Spark, for example, as below:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>tar xzvf spark-3.0.0-bin-hadoop2.7.tgz
</pre></div>
</div>
<p>Ensure the <code class="docutils literal notranslate"><span class="pre">SPARK_HOME</span></code> environment variable points to the directory where the tar file has been extracted.
Update <code class="docutils literal notranslate"><span class="pre">PYTHONPATH</span></code> environment variable such that it can find the PySpark and Py4J under <code class="docutils literal notranslate"><span class="pre">SPARK_HOME/python/lib</span></code>.
One example of doing this is shown below:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">cd</span> spark-3.0.0-bin-hadoop2.7
<span class="nb">export</span> <span class="nv">SPARK_HOME</span><span class="o">=</span><span class="sb">`</span><span class="nb">pwd</span><span class="sb">`</span>
<span class="nb">export</span> <span class="nv">PYTHONPATH</span><span class="o">=</span><span class="k">$(</span><span class="nv">ZIPS</span><span class="o">=(</span><span class="s2">&quot;</span><span class="nv">$SPARK_HOME</span><span class="s2">&quot;</span>/python/lib/*.zip<span class="k">)</span><span class="p">;</span> <span class="nv">IFS</span><span class="o">=</span>:<span class="p">;</span> <span class="nb">echo</span> <span class="s2">&quot;</span><span class="si">${</span><span class="nv">ZIPS</span><span class="p">[*]</span><span class="si">}</span><span class="s2">&quot;</span><span class="o">)</span>:<span class="nv">$PYTHONPATH</span>
</pre></div>
</div>
</section>
<section id="installing-from-source">
<h2>Installing from Source<a class="headerlink" href="#installing-from-source" title="Permalink to this headline"></a></h2>
<p>To install PySpark from source, refer to <a class="reference external" href="https://spark.apache.org/docs/3.1.3/building-spark.html">Building Spark</a>.</p>
</section>
<section id="dependencies">
<h2>Dependencies<a class="headerlink" href="#dependencies" title="Permalink to this headline"></a></h2>
<table class="table">
<thead>
<tr class="row-odd"><th class="head"><p>Package</p></th>
<th class="head"><p>Minimum supported version</p></th>
<th class="head"><p>Note</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p><cite>pandas</cite></p></td>
<td><p>0.23.2</p></td>
<td><p>Optional for SQL</p></td>
</tr>
<tr class="row-odd"><td><p><cite>NumPy</cite></p></td>
<td><p>1.7</p></td>
<td><p>Required for ML</p></td>
</tr>
<tr class="row-even"><td><p><cite>pyarrow</cite></p></td>
<td><p>1.0.0</p></td>
<td><p>Optional for SQL</p></td>
</tr>
<tr class="row-odd"><td><p><cite>Py4J</cite></p></td>
<td><p>0.10.9</p></td>
<td><p>Required</p></td>
</tr>
</tbody>
</table>
<p>Note that PySpark requires Java 8 or later with <code class="docutils literal notranslate"><span class="pre">JAVA_HOME</span></code> properly set.
If using JDK 11, set <code class="docutils literal notranslate"><span class="pre">-Dio.netty.tryReflectionSetAccessible=true</span></code> for Arrow related features and refer
to <a class="reference external" href="https://spark.apache.org/docs/3.1.3/#downloading">Downloading</a>.</p>
</section>
</section>
</div>
<div class='prev-next-bottom'>
<a class='left-prev' id="prev-link" href="index.html" title="previous page">Getting Started</a>
<a class='right-next' id="next-link" href="quickstart.html" title="next page">Quickstart</a>
</div>
</main>
</div>
</div>
<script src="../_static/js/index.3da636dd464baa7582d2.js"></script>
<footer class="footer mt-5 mt-md-0">
<div class="container">
<p>
&copy; Copyright .<br/>
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.0.4.<br/>
</p>
</div>
</footer>
</body>
</html>