| |
| <!DOCTYPE html> |
| |
| <html> |
| <head> |
| <meta charset="utf-8" /> |
| <title>Installation — PySpark 3.3.4 documentation</title> |
| |
| <link rel="stylesheet" href="../_static/css/index.73d71520a4ca3b99cfee5594769eaaae.css"> |
| |
| |
| <link rel="stylesheet" |
| href="../_static/vendor/fontawesome/5.13.0/css/all.min.css"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2"> |
| <link rel="preload" as="font" type="font/woff2" crossorigin |
| href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2"> |
| |
| |
| |
| <link rel="stylesheet" |
| href="../_static/vendor/open-sans_all/1.44.1/index.css"> |
| <link rel="stylesheet" |
| href="../_static/vendor/lato_latin-ext/1.44.1/index.css"> |
| |
| |
| <link rel="stylesheet" href="../_static/basic.css" type="text/css" /> |
| <link rel="stylesheet" href="../_static/pygments.css" type="text/css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/copybutton.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/css/pyspark.css" /> |
| |
| <link rel="preload" as="script" href="../_static/js/index.3da636dd464baa7582d2.js"> |
| |
| <script id="documentation_options" data-url_root="../" src="../_static/documentation_options.js"></script> |
| <script src="../_static/jquery.js"></script> |
| <script src="../_static/underscore.js"></script> |
| <script src="../_static/doctools.js"></script> |
| <script src="../_static/language_data.js"></script> |
| <script src="../_static/clipboard.min.js"></script> |
| <script src="../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script async="async" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/latest.js?config=TeX-AMS-MML_HTMLorMML"></script> |
| <script type="text/x-mathjax-config">MathJax.Hub.Config({"tex2jax": {"inlineMath": [["$", "$"], ["\\(", "\\)"]], "processEscapes": true, "ignoreClass": "document", "processClass": "math|output_area"}})</script> |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/getting_started/install.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="Quickstart: DataFrame" href="quickstart_df.html" /> |
| <link rel="prev" title="Getting Started" href="index.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="en" /> |
| </head> |
| <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="80"> |
| |
| <nav class="navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main"> |
| <div class="container-xl"> |
| |
| <a class="navbar-brand" href="../index.html"> |
| |
| <img src="../_static/spark-logo-reverse.png" class="logo" alt="logo" /> |
| |
| </a> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-menu" aria-controls="navbar-menu" aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| |
| <div id="navbar-menu" class="col-lg-9 collapse navbar-collapse"> |
| <ul id="navbar-main-elements" class="navbar-nav mr-auto"> |
| |
| |
| <li class="nav-item active"> |
| <a class="nav-link" href="index.html">Getting Started</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../user_guide/index.html">User Guide</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../reference/index.html">API Reference</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../development/index.html">Development</a> |
| </li> |
| |
| <li class="nav-item "> |
| <a class="nav-link" href="../migration_guide/index.html">Migration Guide</a> |
| </li> |
| |
| |
| </ul> |
| |
| |
| |
| |
| <ul class="navbar-nav"> |
| |
| |
| </ul> |
| </div> |
| </div> |
| </nav> |
| |
| |
| <div class="container-xl"> |
| <div class="row"> |
| |
| <div class="col-12 col-md-3 bd-sidebar"><form class="bd-search d-flex align-items-center" action="../search.html" method="get"> |
| <i class="icon fas fa-search"></i> |
| <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" > |
| </form> |
| <nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation"> |
| |
| <div class="bd-toc-item active"> |
| |
| |
| <ul class="nav bd-sidenav"> |
| |
| |
| |
| |
| <li class="active"> |
| <a href="">Installation</a> |
| </li> |
| |
| |
| |
| <li class=""> |
| <a href="quickstart_df.html">Quickstart: DataFrame</a> |
| </li> |
| |
| |
| |
| <li class=""> |
| <a href="quickstart_ps.html">Quickstart: Pandas API on Spark</a> |
| </li> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| </ul> |
| |
| </nav> |
| </div> |
| |
| |
| |
| <div class="d-none d-xl-block col-xl-2 bd-toc"> |
| |
| <div class="tocsection onthispage pt-5 pb-3"> |
| <i class="fas fa-list"></i> On this page |
| </div> |
| |
| <nav id="bd-toc-nav"> |
| <ul class="nav section-nav flex-column"> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#python-version-supported" class="nav-link">Python Version Supported</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#using-pypi" class="nav-link">Using PyPI</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#using-conda" class="nav-link">Using Conda</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#manually-downloading" class="nav-link">Manually Downloading</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#installing-from-source" class="nav-link">Installing from Source</a> |
| </li> |
| |
| <li class="nav-item toc-entry toc-h2"> |
| <a href="#dependencies" class="nav-link">Dependencies</a> |
| </li> |
| |
| </ul> |
| </nav> |
| |
| |
| |
| </div> |
| |
| |
| |
| <main class="col-12 col-md-9 col-xl-7 py-md-5 pl-md-5 pr-md-4 bd-content" role="main"> |
| |
| <div> |
| |
| <div class="section" id="installation"> |
| <h1>Installation<a class="headerlink" href="#installation" title="Permalink to this headline">¶</a></h1> |
| <p>PySpark is included in the official releases of Spark available in the <a class="reference external" href="https://spark.apache.org/downloads.html">Apache Spark website</a>. |
| For Python users, PySpark also provides <code class="docutils literal notranslate"><span class="pre">pip</span></code> installation from PyPI. This is usually for local usage or as |
| a client to connect to a cluster rather than setting up a cluster yourself.</p> |
| <p>This page includes instructions for installing PySpark by using pip, Conda, downloading manually, |
| and building from the source.</p> |
| <div class="section" id="python-version-supported"> |
| <h2>Python Version Supported<a class="headerlink" href="#python-version-supported" title="Permalink to this headline">¶</a></h2> |
| <p>Python 3.7 and above.</p> |
| </div> |
| <div class="section" id="using-pypi"> |
| <h2>Using PyPI<a class="headerlink" href="#using-pypi" title="Permalink to this headline">¶</a></h2> |
| <p>PySpark installation using <a class="reference external" href="https://pypi.org/project/pyspark/">PyPI</a> is as follows:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>pyspark |
| </pre></div> |
| </div> |
| <p>If you want to install extra dependencies for a specific component, you can install it as below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Spark SQL</span> |
| pip<span class="w"> </span>install<span class="w"> </span>pyspark<span class="o">[</span>sql<span class="o">]</span> |
| <span class="c1"># pandas API on Spark</span> |
| pip<span class="w"> </span>install<span class="w"> </span>pyspark<span class="o">[</span>pandas_on_spark<span class="o">]</span><span class="w"> </span>plotly<span class="w"> </span><span class="c1"># to plot your data, you can install plotly together.</span> |
| </pre></div> |
| </div> |
| <p>For PySpark with/without a specific Hadoop version, you can install it by using <code class="docutils literal notranslate"><span class="pre">PYSPARK_HADOOP_VERSION</span></code> environment variables as below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">2</span><span class="w"> </span>pip<span class="w"> </span>install<span class="w"> </span>pyspark |
| </pre></div> |
| </div> |
| <p>The default distribution uses Hadoop 3.3 and Hive 2.3. If users specify different versions of Hadoop, the pip installation automatically |
| downloads a different version and uses it in PySpark. Downloading it can take a while depending on |
| the network and the mirror chosen. <code class="docutils literal notranslate"><span class="pre">PYSPARK_RELEASE_MIRROR</span></code> can be set to manually choose the mirror for faster downloading.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_RELEASE_MIRROR</span><span class="o">=</span>http://mirror.apache-kr.org<span class="w"> </span><span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">2</span><span class="w"> </span>pip<span class="w"> </span>install<span class="w"> </span>pyspark |
| </pre></div> |
| </div> |
| <p>It is recommended to use <code class="docutils literal notranslate"><span class="pre">-v</span></code> option in <code class="docutils literal notranslate"><span class="pre">pip</span></code> to track the installation and download status.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">2</span><span class="w"> </span>pip<span class="w"> </span>install<span class="w"> </span>pyspark<span class="w"> </span>-v |
| </pre></div> |
| </div> |
| <p>Supported values in <code class="docutils literal notranslate"><span class="pre">PYSPARK_HADOOP_VERSION</span></code> are:</p> |
| <ul class="simple"> |
| <li><p><code class="docutils literal notranslate"><span class="pre">without</span></code>: Spark pre-built with user-provided Apache Hadoop</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">2</span></code>: Spark pre-built for Apache Hadoop 2.7</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">3</span></code>: Spark pre-built for Apache Hadoop 3.3 and later (default)</p></li> |
| </ul> |
| <p>Note that this installation way of PySpark with/without a specific Hadoop version is experimental. It can change or be removed between minor releases.</p> |
| </div> |
| <div class="section" id="using-conda"> |
| <h2>Using Conda<a class="headerlink" href="#using-conda" title="Permalink to this headline">¶</a></h2> |
| <p>Conda is an open-source package management and environment management system (developed by |
| <a class="reference external" href="https://www.anaconda.com/">Anaconda</a>), which is best installed through |
| <a class="reference external" href="https://docs.conda.io/en/latest/miniconda.html">Miniconda</a> or <a class="reference external" href="https://github.com/conda-forge/miniforge/">Miniforge</a>. |
| The tool is both cross-platform and language agnostic, and in practice, conda can replace both |
| <a class="reference external" href="https://pip.pypa.io/en/latest/">pip</a> and <a class="reference external" href="https://virtualenv.pypa.io/en/latest/">virtualenv</a>.</p> |
| <p>Conda uses so-called channels to distribute packages, and together with the default channels by |
| Anaconda itself, the most important channel is <a class="reference external" href="https://conda-forge.org/">conda-forge</a>, which |
| is the community-driven packaging effort that is the most extensive &amp; the most current (and also |
| serves as the upstream for the Anaconda channels in most cases).</p> |
| <p>To create a new conda environment from your terminal and activate it, proceed as shown below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda<span class="w"> </span>create<span class="w"> </span>-n<span class="w"> </span>pyspark_env |
| conda<span class="w"> </span>activate<span class="w"> </span>pyspark_env |
| </pre></div> |
| </div> |
| <p>After activating the environment, use the following command to install pyspark, |
| a python version of your choice, as well as other packages you want to use in |
| the same session as pyspark (you can install in several steps too).</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda<span class="w"> </span>install<span class="w"> </span>-c<span class="w"> </span>conda-forge<span class="w"> </span>pyspark<span class="w"> </span><span class="c1"># can also add "python=3.8 some_package [etc.]" here</span> |
| </pre></div> |
| </div> |
| <p>Note that <a class="reference external" href="https://anaconda.org/conda-forge/pyspark">PySpark for conda</a> is maintained |
| separately by the community; while new versions generally get packaged quickly, the |
| availability through conda(-forge) is not directly in sync with the PySpark release cycle.</p> |
| <p>While using pip in a conda environment is technically feasible (with the same command as |
| <a class="reference external" href="#using-pypi">above</a>), this approach is <a class="reference external" href="https://www.anaconda.com/blog/using-pip-in-a-conda-environment/">discouraged</a>, |
| because pip does not interoperate with conda.</p> |
| <p>For a short summary about useful conda commands, see their |
| <a class="reference external" href="https://docs.conda.io/projects/conda/en/latest/user-guide/cheatsheet.html">cheat sheet</a>.</p> |
| </div> |
| <div class="section" id="manually-downloading"> |
| <h2>Manually Downloading<a class="headerlink" href="#manually-downloading" title="Permalink to this headline">¶</a></h2> |
| <p>PySpark is included in the distributions available at the <a class="reference external" href="https://spark.apache.org/downloads.html">Apache Spark website</a>. |
| You can download a distribution you want from the site. After that, uncompress the tar file into the directory where you want |
| to install Spark, for example, as below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>tar<span class="w"> </span>xzvf<span class="w"> </span>spark-3.3.0-bin-hadoop3.tgz |
| </pre></div> |
| </div> |
| <p>Ensure the <code class="docutils literal notranslate"><span class="pre">SPARK_HOME</span></code> environment variable points to the directory where the tar file has been extracted. |
| Update <code class="docutils literal notranslate"><span class="pre">PYTHONPATH</span></code> environment variable such that it can find the PySpark and Py4J under <code class="docutils literal notranslate"><span class="pre">SPARK_HOME/python/lib</span></code>. |
| One example of doing this is shown below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">cd</span><span class="w"> </span>spark-3.3.0-bin-hadoop3 |
| <span class="nb">export</span><span class="w"> </span><span class="nv">SPARK_HOME</span><span class="o">=</span><span class="sb">`</span><span class="nb">pwd</span><span class="sb">`</span> |
| <span class="nb">export</span><span class="w"> </span><span class="nv">PYTHONPATH</span><span class="o">=</span><span class="k">$(</span><span class="nv">ZIPS</span><span class="o">=(</span><span class="s2">"</span><span class="nv">$SPARK_HOME</span><span class="s2">"</span>/python/lib/*.zip<span class="k">)</span><span class="p">;</span><span class="w"> </span><span class="nv">IFS</span><span class="o">=</span>:<span class="p">;</span><span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"</span><span class="si">${</span><span class="nv">ZIPS</span><span class="p">[*]</span><span class="si">}</span><span class="s2">"</span><span class="o">)</span>:<span class="nv">$PYTHONPATH</span> |
| </pre></div> |
| </div> |
| </div> |
| <div class="section" id="installing-from-source"> |
| <h2>Installing from Source<a class="headerlink" href="#installing-from-source" title="Permalink to this headline">¶</a></h2> |
| <p>To install PySpark from source, refer to <a class="reference external" href="https://spark.apache.org/docs/3.3.4/building-spark.html">Building Spark</a>.</p> |
| </div> |
| <div class="section" id="dependencies"> |
| <h2>Dependencies<a class="headerlink" href="#dependencies" title="Permalink to this headline">¶</a></h2> |
| <table class="table"> |
| <colgroup> |
| <col style="width: 13%" /> |
| <col style="width: 25%" /> |
| <col style="width: 62%" /> |
| </colgroup> |
| <thead> |
| <tr class="row-odd"><th class="head"><p>Package</p></th> |
| <th class="head"><p>Minimum supported version</p></th> |
| <th class="head"><p>Note</p></th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr class="row-even"><td><p><cite>pandas</cite></p></td> |
| <td><p>1.0.5</p></td> |
| <td><p>Optional for Spark SQL</p></td> |
| </tr> |
| <tr class="row-odd"><td><p><cite>pyarrow</cite></p></td> |
| <td><p>1.0.0</p></td> |
| <td><p>Optional for Spark SQL</p></td> |
| </tr> |
| <tr class="row-even"><td><p><cite>py4j</cite></p></td> |
| <td><p>0.10.9.5</p></td> |
| <td><p>Required</p></td> |
| </tr> |
| <tr class="row-odd"><td><p><cite>pandas</cite></p></td> |
| <td><p>1.0.5</p></td> |
| <td><p>Required for pandas API on Spark</p></td> |
| </tr> |
| <tr class="row-even"><td><p><cite>pyarrow</cite></p></td> |
| <td><p>1.0.0</p></td> |
| <td><p>Required for pandas API on Spark</p></td> |
| </tr> |
| <tr class="row-odd"><td><p><cite>numpy</cite></p></td> |
| <td><p>1.15</p></td> |
| <td><p>Required for pandas API on Spark and MLLib DataFrame-based API</p></td> |
| </tr> |
| </tbody> |
| </table> |
| <p>Note that PySpark requires Java 8 or later with <code class="docutils literal notranslate"><span class="pre">JAVA_HOME</span></code> properly set. |
| If using JDK 11, set <code class="docutils literal notranslate"><span class="pre">-Dio.netty.tryReflectionSetAccessible=true</span></code> for Arrow related features and refer |
| to <a class="reference external" href="https://spark.apache.org/docs/3.3.4/#downloading">Downloading</a>.</p> |
| <p>Note for AArch64 (ARM64) users: PyArrow is required by PySpark SQL, but PyArrow support for AArch64 |
| is introduced in PyArrow 4.0.0. If PySpark installation fails on AArch64 due to PyArrow |
| installation errors, you can install PyArrow >= 4.0.0 as below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span><span class="s2">"pyarrow>=4.0.0"</span><span class="w"> </span>--prefer-binary |
| </pre></div> |
| </div> |
| </div> |
| </div> |
| |
| |
| </div> |
| |
| |
| <div class='prev-next-bottom'> |
| |
| <a class='left-prev' id="prev-link" href="index.html" title="previous page">Getting Started</a> |
| <a class='right-next' id="next-link" href="quickstart_df.html" title="next page">Quickstart: DataFrame</a> |
| |
| </div> |
| |
| </main> |
| |
| |
| </div> |
| </div> |
| |
| |
| <script src="../_static/js/index.3da636dd464baa7582d2.js"></script> |
| |
| |
| <footer class="footer mt-5 mt-md-0"> |
| <div class="container"> |
| <p> |
| © Copyright .<br/> |
| Created using <a href="http://sphinx-doc.org/">Sphinx</a> 3.0.4.<br/> |
| </p> |
| </div> |
| </footer> |
| </body> |
| </html> |