| |
| <!DOCTYPE html> |
| |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" /> |
| |
| <title>Installation — PySpark 3.1.3 documentation</title> |
| |
| <link rel="stylesheet" href="../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css"> |
| |
| |
| <link rel="stylesheet" |
| href="../_static/vendor/fontawesome/5.13.0/css/all.min.css"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2"> |
| |
| |
| |
| <link rel="stylesheet" |
| href="../_static/vendor/open-sans_all/1.44.1/index.css"> |
| <link rel="stylesheet" |
| href="../_static/vendor/lato_latin-ext/1.44.1/index.css"> |
| |
| |
| <link rel="stylesheet" href="../_static/basic.css" type="text/css" /> |
| <link rel="stylesheet" href="../_static/pygments.css" type="text/css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/css/pyspark.css" /> |
| |
| <link rel="preload" as="script" href="../_static/js/index.3da636dd464baa7582d2.js"> |
| |
| <script id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script> |
| <script src="../_static/jquery.js"></script> |
| <script src="../_static/underscore.js"></script> |
| <script src="../_static/doctools.js"></script> |
| <script src="../_static/language_data.js"></script> |
| <script src="../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script> |
| <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script> |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/getting_started/install.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="Quickstart" href="quickstart.html" /> |
| <link rel="prev" title="Getting Started" href="index.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="en" /> |
| </head> |
| <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80"> |
| |
| <nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main"> |
| <div class="container-xl"> |
| |
| <a class="navbar-brand" href="../index.html"> |
| |
| <img src="../_static/spark-logo-reverse.png" class="logo" alt="logo" /> |
| |
| </a> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| |
| <div id="navbar-menu" class="col-lg-9 collapse navbar-collapse"> |
| <ul id="navbar-main-elements" class="navbar-nav mr-auto"> |
| |
| |
| <li class="nav-item active"> |
| <a class="nav-link" href="index.html">Getting Started</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../user_guide/index.html">User Guide</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../reference/index.html">API Reference</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../development/index.html">Development</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../migration_guide/index.html">Migration Guide</a> |
| </li> |
| |
| |
| </ul> |
| |
| |
| |
| |
| <ul class="navbar-nav"> |
| |
| |
| </ul> |
| </div> |
| </div> |
| </nav> |
| |
| |
| <div class="container-xl"> |
| <div class="row"> |
| |
| <div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../search.html" method="get"> |
| <i class="icon fas fa-search"></i> |
| <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" > |
| </form> |
| <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation"> |
| |
| <div class="bd-toc-item active"> |
| |
| |
| <ul class="nav bd-sidenav"> |
| |
| |
| |
| |
| <li class="active"> |
| <a href="">Installation</a> |
| </li> |
| |
| |
| |
| <li class=""> |
| <a href="quickstart.html">Quickstart</a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </ul> |
| |
| </nav> |
| </div> |
| |
| |
| |
| <div class="d-none d-xl-block col-xl-2 bd-toc"> |
| |
| <div class="tocsection onthispage pt-5 pb-3"> |
| <i class="fas fa-list"></i> On this page |
| </div> |
| |
| <nav id="bd-toc-nav"> |
| <ul class="nav section-nav flex-column"> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#python-version-supported" class="nav-link">Python Version Supported</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#using-pypi" class="nav-link">Using PyPI</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#using-conda" class="nav-link">Using Conda</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#manually-downloading" class="nav-link">Manually Downloading</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#installing-from-source" class="nav-link">Installing from Source</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#dependencies" class="nav-link">Dependencies</a> |
| </li> |
| |
| </ul> |
| </nav> |
| |
| |
| |
| </div> |
| |
| |
| |
| <main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main"> |
| |
| <div> |
| |
| <section id="installation"> |
| <h1>Installation<a class="headerlink" href="#installation" title="Permalink to this headline">¶</a></h1> |
| <p>PySpark is included in the official releases of Spark available in the <a class="reference external" href="https://spark.apache.org/downloads.html">Apache Spark website</a>. |
| For Python users, PySpark also provides <code class="docutils literal notranslate"><span class="pre">pip</span></code> installation from PyPI. This is usually for local usage or as |
| a client to connect to a cluster instead of setting up a cluster itself.</p> |
| <p>This page includes instructions for installing PySpark by using pip, Conda, downloading manually, |
| and building from the source.</p> |
| <section id="python-version-supported"> |
| <h2>Python Version Supported<a class="headerlink" href="#python-version-supported" title="Permalink to this headline">¶</a></h2> |
| <p>Python 3.6 and above.</p> |
| </section> |
| <section id="using-pypi"> |
| <h2>Using PyPI<a class="headerlink" href="#using-pypi" title="Permalink to this headline">¶</a></h2> |
| <p>PySpark installation using <a class="reference external" href="https://pypi.org/project/pyspark/">PyPI</a> is as follows:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip install pyspark |
| </pre></div> |
| </div> |
| <p>If you want to install extra dependencies for a specific component, you can install it as below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip install pyspark<span class="o">[</span>sql<span class="o">]</span> |
| </pre></div> |
| </div> |
| <p>For PySpark with/without a specific Hadoop version, you can install it by using the <code class="docutils literal notranslate"><span class="pre">PYSPARK_HADOOP_VERSION</span></code> environment variable as below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">2</span>.7 pip install pyspark |
| </pre></div> |
| </div> |
| <p>The default distribution uses Hadoop 3.2 and Hive 2.3. If users specify different versions of Hadoop, the pip installation automatically |
| downloads a different version and uses it in PySpark. Downloading it can take a while depending on |
| the network and the mirror chosen. <code class="docutils literal notranslate"><span class="pre">PYSPARK_RELEASE_MIRROR</span></code> can be set to manually choose the mirror for faster downloading.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_RELEASE_MIRROR</span><span class="o">=</span>http://mirror.apache-kr.org <span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">2</span>.7 pip install pyspark |
| </pre></div> |
| </div> |
| <p>It is recommended to use the <code class="docutils literal notranslate"><span class="pre">-v</span></code> option in <code class="docutils literal notranslate"><span class="pre">pip</span></code> to track the installation and download status.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">2</span>.7 pip install pyspark -v |
| </pre></div> |
| </div> |
| <p>Supported values in <code class="docutils literal notranslate"><span class="pre">PYSPARK_HADOOP_VERSION</span></code> are:</p> |
| <ul class="simple"> |
| <li><p><code class="docutils literal notranslate"><span class="pre">without</span></code>: Spark pre-built with user-provided Apache Hadoop</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">2.7</span></code>: Spark pre-built for Apache Hadoop 2.7</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">3.2</span></code>: Spark pre-built for Apache Hadoop 3.2 and later (default)</p></li> |
| </ul> |
| <p>Note that this way of installing PySpark with/without a specific Hadoop version is experimental. It can change or be removed between minor releases.</p> |
| </section> |
| <section id="using-conda"> |
| <h2>Using Conda<a class="headerlink" href="#using-conda" title="Permalink to this headline">¶</a></h2> |
| <p>Conda is an open-source package management and environment management system which is a part of |
| the <a class="reference external" href="https://docs.continuum.io/anaconda/">Anaconda</a> distribution. It is both cross-platform and |
| language agnostic. In practice, Conda can replace both <a class="reference external" href="https://pip.pypa.io/en/latest/">pip</a> and |
| <a class="reference external" href="https://virtualenv.pypa.io/en/latest/">virtualenv</a>.</p> |
| <p>Create a new virtual environment from your terminal as shown below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda create -n pyspark_env |
| </pre></div> |
| </div> |
| <p>After the virtual environment is created, it should be visible under the list of Conda environments |
| which can be seen using the following command:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda env list |
| </pre></div> |
| </div> |
| <p>Now activate the newly created environment with the following command:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda activate pyspark_env |
| </pre></div> |
| </div> |
| <p>You can install PySpark by <a class="reference external" href="#using-pypi">Using PyPI</a> in the newly created |
| environment, for example as below. It will install PySpark under the new virtual environment |
| <code class="docutils literal notranslate"><span class="pre">pyspark_env</span></code> created above.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip install pyspark |
| </pre></div> |
| </div> |
| <p>Alternatively, you can install PySpark from Conda itself as below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda install pyspark |
| </pre></div> |
| </div> |
| <p>However, note that <a class="reference external" href="https://anaconda.org/conda-forge/pyspark">PySpark at Conda</a> is not necessarily |
| synced with the PySpark release cycle because it is maintained separately by the community.</p> |
| </section> |
| <section id="manually-downloading"> |
| <h2>Manually Downloading<a class="headerlink" href="#manually-downloading" title="Permalink to this headline">¶</a></h2> |
| <p>PySpark is included in the distributions available at the <a class="reference external" href="https://spark.apache.org/downloads.html">Apache Spark website</a>. |
| You can download a distribution you want from the site. After that, uncompress the tar file into the directory where you want |
| to install Spark, for example, as below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>tar xzvf spark-3.0.0-bin-hadoop2.7.tgz |
| </pre></div> |
| </div> |
| <p>Ensure the <code class="docutils literal notranslate"><span class="pre">SPARK_HOME</span></code> environment variable points to the directory where the tar file has been extracted. |
| Update the <code class="docutils literal notranslate"><span class="pre">PYTHONPATH</span></code> environment variable so that it can find PySpark and Py4J under <code class="docutils literal notranslate"><span class="pre">SPARK_HOME/python/lib</span></code>. |
| One example of doing this is shown below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">cd</span> spark-3.0.0-bin-hadoop2.7 |
| <span class="nb">export</span> <span class="nv">SPARK_HOME</span><span class="o">=</span><span class="sb">`</span><span class="nb">pwd</span><span class="sb">`</span> |
| <span class="nb">export</span> <span class="nv">PYTHONPATH</span><span class="o">=</span><span class="k">$(</span><span class="nv">ZIPS</span><span class="o">=(</span><span class="s2">"</span><span class="nv">$SPARK_HOME</span><span class="s2">"</span>/python/lib/*.zip<span class="k">)</span><span class="p">;</span> <span class="nv">IFS</span><span class="o">=</span>:<span class="p">;</span> <span class="nb">echo</span> <span class="s2">"</span><span class="si">${</span><span class="nv">ZIPS</span><span class="p">[*]</span><span class="si">}</span><span class="s2">"</span><span class="o">)</span>:<span class="nv">$PYTHONPATH</span> |
| </pre></div> |
| </div> |
| </section> |
| <section id="installing-from-source"> |
| <h2>Installing from Source<a class="headerlink" href="#installing-from-source" title="Permalink to this headline">¶</a></h2> |
| <p>To install PySpark from source, refer to <a class="reference external" href="https://spark.apache.org/docs/3.1.3/building-spark.html">Building Spark</a>.</p> |
| </section> |
| <section id="dependencies"> |
| <h2>Dependencies<a class="headerlink" href="#dependencies" title="Permalink to this headline">¶</a></h2> |
| <table class="table"> |
| <thead> |
| <tr class="row-odd"><th class="head"><p>Package</p></th> |
| <th class="head"><p>Minimum supported version</p></th> |
| <th class="head"><p>Note</p></th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr class="row-even"><td><p><cite>pandas</cite></p></td> |
| <td><p>0.23.2</p></td> |
| <td><p>Optional for SQL</p></td> |
| </tr> |
| <tr class="row-odd"><td><p><cite>NumPy</cite></p></td> |
| <td><p>1.7</p></td> |
| <td><p>Required for ML</p></td> |
| </tr> |
| <tr class="row-even"><td><p><cite>pyarrow</cite></p></td> |
| <td><p>1.0.0</p></td> |
| <td><p>Optional for SQL</p></td> |
| </tr> |
| <tr class="row-odd"><td><p><cite>Py4J</cite></p></td> |
| <td><p>0.10.9</p></td> |
| <td><p>Required</p></td> |
| </tr> |
| </tbody> |
| </table> |
| <p>Note that PySpark requires Java 8 or later with <code class="docutils literal notranslate"><span class="pre">JAVA_HOME</span></code> properly set. |
| If using JDK 11, set <code class="docutils literal notranslate"><span class="pre">-Dio.netty.tryReflectionSetAccessible=true</span></code> for Arrow related features and refer |
| to <a class="reference external" href="https://spark.apache.org/docs/3.1.3/#downloading">Downloading</a>.</p> |
| </section> |
| </section> |
| |
| |
| </div> |
| |
| |
| <div class='prev-next-bottom'> |
| |
| <a class='left-prev' id="prev-link" href="index.html" title="previous page">Getting Started</a> |
| <a class='right-next' id="next-link" href="quickstart.html" title="next page">Quickstart</a> |
| |
| </div> |
| |
| </main> |
| |
| |
| </div> |
| </div> |
| |
| |
| <script src="../_static/js/index.3da636dd464baa7582d2.js"></script> |
| |
| |
| <footer class="footer mt-5 mt-md-0"> |
| <div class="container"> |
| <p> |
| © Copyright .<br/> |
| Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.0.4.<br/> |
| </p> |
| </div> |
| </footer> |
| </body> |
| </html> |