| |
| |
| <!DOCTYPE html> |
| |
| |
| <html lang="en"> |
| |
| <head> |
| <meta charset="utf-8" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.17.1: http://docutils.sourceforge.net/" /> |
| |
| <title>Installation — PySpark 4.0.0-preview1 documentation</title> |
| |
| |
| |
| <script data-cfasync="false"> |
| document.documentElement.dataset.mode = localStorage.getItem("mode") || ""; |
| document.documentElement.dataset.theme = localStorage.getItem("theme") || "light"; |
| </script> |
| |
| <!-- Loaded before other Sphinx assets --> |
| <link href="../_static/styles/theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../_static/styles/bootstrap.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link href="../_static/styles/pydata-sphinx-theme.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| |
| |
| <link href="../_static/vendor/fontawesome/6.1.2/css/all.min.css?digest=e353d410970836974a52" rel="stylesheet" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-solid-900.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-brands-400.woff2" /> |
| <link rel="preload" as="font" type="font/woff2" crossorigin href="../_static/vendor/fontawesome/6.1.2/webfonts/fa-regular-400.woff2" /> |
| |
| <link rel="stylesheet" type="text/css" href="../_static/pygments.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/copybutton.css" /> |
| <link rel="stylesheet" type="text/css" href="../_static/css/pyspark.css" /> |
| |
| <!-- Pre-loaded scripts that we'll load fully later --> |
| <link rel="preload" as="script" href="../_static/scripts/bootstrap.js?digest=e353d410970836974a52" /> |
| <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52" /> |
| |
| <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script> |
| <script src="../_static/jquery.js"></script> |
| <script src="../_static/underscore.js"></script> |
| <script src="../_static/doctools.js"></script> |
| <script src="../_static/clipboard.min.js"></script> |
| <script src="../_static/copybutton.js"></script> |
| <script crossorigin="anonymous" integrity="sha256-Ae2Vz/4ePdIu6ZyI/5ZGsYnb+m0JlOmKPjt6XZ9JJkA=" src="https://cdnjs.cloudflare.com/ajax/libs/require.js/2.3.4/require.min.js"></script> |
| <script>DOCUMENTATION_OPTIONS.pagename = 'getting_started/install';</script> |
| <link rel="canonical" href="https://spark.apache.org/docs/latest/api/python/getting_started/install.html" /> |
| <link rel="search" title="Search" href="../search.html" /> |
| <link rel="next" title="Quickstart: DataFrame" href="quickstart_df.html" /> |
| <link rel="prev" title="Getting Started" href="index.html" /> |
| <meta name="viewport" content="width=device-width, initial-scale=1" /> |
| <meta name="docsearch:language" content="en"> |
| |
| |
| <!-- Matomo --> |
| <script type="text/javascript"> |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| _paq.push(["disableCookies"]); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function() { |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '40']); |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| |
| </head> |
| |
| |
| <body data-bs-spy="scroll" data-bs-target=".bd-toc-nav" data-offset="180" data-bs-root-margin="0px 0px -60%" data-default-mode=""> |
| |
| |
| |
| <a class="skip-link" href="#main-content">Skip to main content</a> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__primary" |
| id="__primary"/> |
| <label class="overlay overlay-primary" for="__primary"></label> |
| |
| <input type="checkbox" |
| class="sidebar-toggle" |
| name="__secondary" |
| id="__secondary"/> |
| <label class="overlay overlay-secondary" for="__secondary"></label> |
| |
| <div class="search-button__wrapper"> |
| <div class="search-button__overlay"></div> |
| <div class="search-button__search-container"> |
| <form class="bd-search d-flex align-items-center" |
| action="../search.html" |
| method="get"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| <input type="search" |
| class="form-control" |
| name="q" |
| id="search-input" |
| placeholder="Search the docs ..." |
| aria-label="Search the docs ..." |
| autocomplete="off" |
| autocorrect="off" |
| autocapitalize="off" |
| spellcheck="false"/> |
| <span class="search-button__kbd-shortcut"><kbd class="kbd-shortcut__modifier">Ctrl</kbd>+<kbd>K</kbd></span> |
| </form></div> |
| </div> |
| |
| <nav class="bd-header navbar navbar-expand-lg bd-navbar"> |
| <div class="bd-header__inner bd-page-width"> |
| <label class="sidebar-toggle primary-toggle" for="__primary"> |
| <span class="fa-solid fa-bars"></span> |
| </label> |
| |
| <div class="navbar-header-items__start"> |
| |
| <div class="navbar-item"> |
| |
| |
| <a class="navbar-brand logo" href="../index.html"> |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| <img src="../_static/spark-logo-light.png" class="logo__image only-light" alt="Logo image"/> |
| <script>document.write(`<img src="../_static/spark-logo-dark.png" class="logo__image only-dark" alt="Logo image"/>`);</script> |
| |
| |
| </a></div> |
| |
| </div> |
| |
| |
| <div class="col-lg-9 navbar-header-items"> |
| |
| <div class="me-auto navbar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item current active"> |
| <a class="nav-link nav-internal" href="index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| <div class="navbar-header-items__end"> |
| |
| <div class="navbar-item navbar-persistent--container"> |
| |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview1 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Function to construct the target URL from the JSON components |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| template = template.replace("{version}", entry.version); |
| return template; |
| } |
| |
| // Function to check if corresponding page path exists in other version of docs |
| // and, if so, go there instead of the homepage of the other docs version |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "getting_started/install.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| let tryUrl = `${otherDocsHomepage}${currentFilePath}`; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Function to populate the version switcher |
| (function () { |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| node.onclick = checkPageExistsAndRedirect; |
| $("#version_switcher").append(node); |
| }); |
| }); |
| })(); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <label class="sr-only">GitHub</label></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <label class="sr-only">PyPI</label></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| |
| <div class="navbar-persistent--mobile"> |
| <script> |
| document.write(` |
| <button class="btn btn-sm navbar-btn search-button search-button__button" title="Search" aria-label="Search" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <i class="fa-solid fa-magnifying-glass"></i> |
| </button> |
| `); |
| </script> |
| </div> |
| |
| |
| |
| <label class="sidebar-toggle secondary-toggle" for="__secondary"> |
| <span class="fa-solid fa-outdent"></span> |
| </label> |
| |
| </div> |
| |
| </nav> |
| |
| <div class="bd-container"> |
| <div class="bd-container__inner bd-page-width"> |
| |
| <div class="bd-sidebar-primary bd-sidebar"> |
| |
| |
| |
| <div class="sidebar-header-items sidebar-primary__section"> |
| |
| |
| <div class="sidebar-header-items__center"> |
| |
| <div class="navbar-item"><nav class="navbar-nav"> |
| <p class="sidebar-header-items__title" |
| role="heading" |
| aria-level="1" |
| aria-label="Site Navigation"> |
| Site Navigation |
| </p> |
| <ul class="bd-navbar-elements navbar-nav"> |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../index.html"> |
| Overview |
| </a> |
| </li> |
| |
| |
| <li class="nav-item current active"> |
| <a class="nav-link nav-internal" href="index.html"> |
| Getting Started |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../user_guide/index.html"> |
| User Guides |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../reference/index.html"> |
| API Reference |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../development/index.html"> |
| Development |
| </a> |
| </li> |
| |
| |
| <li class="nav-item"> |
| <a class="nav-link nav-internal" href="../migration_guide/index.html"> |
| Migration Guides |
| </a> |
| </li> |
| |
| </ul> |
| </nav></div> |
| |
| </div> |
| |
| |
| |
| <div class="sidebar-header-items__end"> |
| |
| <div class="navbar-item"><!-- |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <div id="version-button" class="dropdown"> |
| <button type="button" class="btn btn-secondary btn-sm navbar-btn dropdown-toggle" id="version_switcher_button" data-toggle="dropdown"> |
| 4.0.0-preview1 |
| <span class="caret"></span> |
| </button> |
| <div id="version_switcher" class="dropdown-menu list-group-flush py-0" aria-labelledby="version_switcher_button"> |
| <!-- dropdown will be populated by javascript on page load --> |
| </div> |
| </div> |
| |
| <script type="text/javascript"> |
| // Function to construct the target URL from the JSON components |
| function buildURL(entry) { |
| var template = "https://spark.apache.org/docs/{version}/api/python/index.html"; // supplied by jinja |
| template = template.replace("{version}", entry.version); |
| return template; |
| } |
| |
| // Function to check if corresponding page path exists in other version of docs |
| // and, if so, go there instead of the homepage of the other docs version |
| function checkPageExistsAndRedirect(event) { |
| const currentFilePath = "getting_started/install.html", |
| otherDocsHomepage = event.target.getAttribute("href"); |
| let tryUrl = `${otherDocsHomepage}${currentFilePath}`; |
| $.ajax({ |
| type: 'HEAD', |
| url: tryUrl, |
| // if the page exists, go there |
| success: function() { |
| location.href = tryUrl; |
| } |
| }).fail(function() { |
| location.href = otherDocsHomepage; |
| }); |
| return false; |
| } |
| |
| // Function to populate the version switcher |
| (function () { |
| // get JSON config |
| $.getJSON("https://spark.apache.org/static/versions.json", function(data, textStatus, jqXHR) { |
| // create the nodes first (before AJAX calls) to ensure the order is |
| // correct (for now, links will go to doc version homepage) |
| $.each(data, function(index, entry) { |
| // if no custom name specified (e.g., "latest"), use version string |
| if (!("name" in entry)) { |
| entry.name = entry.version; |
| } |
| // construct the appropriate URL, and add it to the dropdown |
| entry.url = buildURL(entry); |
| const node = document.createElement("a"); |
| node.setAttribute("class", "list-group-item list-group-item-action py-1"); |
| node.setAttribute("href", `${entry.url}`); |
| node.textContent = `${entry.name}`; |
| node.onclick = checkPageExistsAndRedirect; |
| $("#version_switcher").append(node); |
| }); |
| }); |
| })(); |
| </script></div> |
| |
| <div class="navbar-item"> |
| <script> |
| document.write(` |
| <button class="theme-switch-button btn btn-sm btn-outline-primary navbar-btn rounded-circle" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip"> |
| <span class="theme-switch" data-mode="light"><i class="fa-solid fa-sun"></i></span> |
| <span class="theme-switch" data-mode="dark"><i class="fa-solid fa-moon"></i></span> |
| <span class="theme-switch" data-mode="auto"><i class="fa-solid fa-circle-half-stroke"></i></span> |
| </button> |
| `); |
| </script></div> |
| |
| <div class="navbar-item"><ul class="navbar-icon-links navbar-nav" |
| aria-label="Icon Links"> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://github.com/apache/spark" title="GitHub" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-brands fa-github"></i></span> |
| <label class="sr-only">GitHub</label></a> |
| </li> |
| <li class="nav-item"> |
| |
| |
| |
| |
| |
| |
| |
| |
| <a href="https://pypi.org/project/pyspark" title="PyPI" class="nav-link" rel="noopener" target="_blank" data-bs-toggle="tooltip" data-bs-placement="bottom"><span><i class="fa-solid fa-box"></i></span> |
| <label class="sr-only">PyPI</label></a> |
| </li> |
| </ul></div> |
| |
| </div> |
| |
| </div> |
| |
| <div class="sidebar-primary-items__start sidebar-primary__section"> |
| <div class="sidebar-primary-item"><nav class="bd-docs-nav bd-links" |
| aria-label="Section Navigation"> |
| <p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p> |
| <div class="bd-toc-item navbar-nav"><ul class="current nav bd-sidenav"> |
| <li class="toctree-l1 current active"><a class="current reference internal" href="#">Installation</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="quickstart_df.html">Quickstart: DataFrame</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="quickstart_connect.html">Quickstart: Spark Connect</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="quickstart_ps.html">Quickstart: Pandas API on Spark</a></li> |
| <li class="toctree-l1"><a class="reference internal" href="testing_pyspark.html">Testing PySpark</a></li> |
| </ul> |
| </div> |
| </nav></div> |
| </div> |
| |
| |
| <div class="sidebar-primary-items__end sidebar-primary__section"> |
| </div> |
| |
| <div id="rtd-footer-container"></div> |
| |
| |
| </div> |
| |
| <main id="main-content" class="bd-main"> |
| |
| |
| <div class="bd-content"> |
| <div class="bd-article-container"> |
| |
| <div class="bd-header-article"> |
| <div class="header-article-items header-article__inner"> |
| |
| <div class="header-article-items__start"> |
| |
| <div class="header-article-item"> |
| |
| |
| |
| <nav aria-label="Breadcrumbs"> |
| <ul class="bd-breadcrumbs"> |
| |
| <li class="breadcrumb-item breadcrumb-home"> |
| <a href="../index.html" class="nav-link" aria-label="Home"> |
| <i class="fa-solid fa-home"></i> |
| </a> |
| </li> |
| |
| <li class="breadcrumb-item"><a href="index.html" class="nav-link">Getting Started</a></li> |
| |
| <li class="breadcrumb-item active" aria-current="page">Installation</li> |
| </ul> |
| </nav> |
| </div> |
| |
| </div> |
| |
| |
| </div> |
| </div> |
| |
| |
| |
| |
| <div id="searchbox"></div> |
| <article class="bd-article" role="main"> |
| |
| <section id="installation"> |
| <h1>Installation<a class="headerlink" href="#installation" title="Permalink to this headline">#</a></h1> |
| <p>PySpark is included in the official releases of Spark available in the <a class="reference external" href="https://spark.apache.org/downloads.html">Apache Spark website</a>. |
| For Python users, PySpark also provides <code class="docutils literal notranslate"><span class="pre">pip</span></code> installation from PyPI. This is usually for local usage or as |
| a client to connect to a cluster instead of setting up a cluster itself.</p> |
| <p>This page includes instructions for installing PySpark by using pip, Conda, downloading manually, |
| and building from the source.</p> |
| <section id="python-versions-supported"> |
| <h2>Python Versions Supported<a class="headerlink" href="#python-versions-supported" title="Permalink to this headline">#</a></h2> |
| <p>Python 3.9 and above.</p> |
| </section> |
| <section id="using-pypi"> |
| <h2>Using PyPI<a class="headerlink" href="#using-pypi" title="Permalink to this headline">#</a></h2> |
| <p>PySpark installation using <a class="reference external" href="https://pypi.org/project/pyspark/">PyPI (pyspark)</a> is as follows:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>pyspark |
| </pre></div> |
| </div> |
| <p>If you want to install extra dependencies for a specific component, you can install it as below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="c1"># Spark SQL</span> |
| pip<span class="w"> </span>install<span class="w"> </span>pyspark<span class="o">[</span>sql<span class="o">]</span> |
| <span class="c1"># pandas API on Spark</span> |
| pip<span class="w"> </span>install<span class="w"> </span>pyspark<span class="o">[</span>pandas_on_spark<span class="o">]</span><span class="w"> </span>plotly<span class="w"> </span><span class="c1"># to plot your data, you can install plotly together.</span> |
| <span class="c1"># Spark Connect</span> |
| pip<span class="w"> </span>install<span class="w"> </span>pyspark<span class="o">[</span>connect<span class="o">]</span> |
| </pre></div> |
| </div> |
| <p>See <a class="reference internal" href="#optional-dependencies"><span class="std std-ref">Optional dependencies</span></a> for more detail about extra dependencies.</p> |
| <p>For PySpark with/without a specific Hadoop version, you can install it by using the <code class="docutils literal notranslate"><span class="pre">PYSPARK_HADOOP_VERSION</span></code> environment variable as below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">3</span><span class="w"> </span>pip<span class="w"> </span>install<span class="w"> </span>pyspark |
| </pre></div> |
| </div> |
| <p>The default distribution uses Hadoop 3.3 and Hive 2.3. If users specify different versions of Hadoop, the pip installation automatically |
| downloads a different version and uses it in PySpark. Downloading it can take a while depending on |
| the network and the mirror chosen. <code class="docutils literal notranslate"><span class="pre">PYSPARK_RELEASE_MIRROR</span></code> can be set to manually choose the mirror for faster downloading.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_RELEASE_MIRROR</span><span class="o">=</span>http://mirror.apache-kr.org<span class="w"> </span><span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">3</span><span class="w"> </span>pip<span class="w"> </span>install<span class="w"> </span>pyspark |
| </pre></div> |
| </div> |
| <p>It is recommended to use the <code class="docutils literal notranslate"><span class="pre">-v</span></code> option in <code class="docutils literal notranslate"><span class="pre">pip</span></code> to track the installation and download status.</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nv">PYSPARK_HADOOP_VERSION</span><span class="o">=</span><span class="m">3</span><span class="w"> </span>pip<span class="w"> </span>install<span class="w"> </span>pyspark<span class="w"> </span>-v |
| </pre></div> |
| </div> |
| <p>Supported values in <code class="docutils literal notranslate"><span class="pre">PYSPARK_HADOOP_VERSION</span></code> are:</p> |
| <ul class="simple"> |
| <li><p><code class="docutils literal notranslate"><span class="pre">without</span></code>: Spark pre-built with user-provided Apache Hadoop</p></li> |
| <li><p><code class="docutils literal notranslate"><span class="pre">3</span></code>: Spark pre-built for Apache Hadoop 3.3 and later (default)</p></li> |
| </ul> |
| <p>Note that this installation of PySpark with/without a specific Hadoop version is experimental. It can change or be removed between minor releases.</p> |
| <section id="python-spark-connect-client"> |
| <h3>Python Spark Connect Client<a class="headerlink" href="#python-spark-connect-client" title="Permalink to this headline">#</a></h3> |
| <p>The Python Spark Connect client is a pure Python library that does not rely on any non-Python dependencies such as jars and JRE in your environment. |
| To install the Python Spark Connect client via <a class="reference external" href="https://pypi.org/project/pyspark-connect/">PyPI (pyspark-connect)</a>, execute the following command:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip<span class="w"> </span>install<span class="w"> </span>pyspark-connect |
| </pre></div> |
| </div> |
| <p>See also <a class="reference external" href="quickstart_connect.html">Quickstart: Spark Connect</a> for how to use it.</p> |
| </section> |
| </section> |
| <section id="using-conda"> |
| <h2>Using Conda<a class="headerlink" href="#using-conda" title="Permalink to this headline">#</a></h2> |
| <p>Conda is an open-source package management and environment management system (developed by |
| <a class="reference external" href="https://www.anaconda.com/">Anaconda</a>), which is best installed through |
| <a class="reference external" href="https://docs.conda.io/en/latest/miniconda.html">Miniconda</a> or <a class="reference external" href="https://github.com/conda-forge/miniforge/">Miniforge</a>. |
| The tool is both cross-platform and language agnostic, and in practice, conda can replace both |
| <a class="reference external" href="https://pip.pypa.io/en/latest/">pip</a> and <a class="reference external" href="https://virtualenv.pypa.io/en/latest/">virtualenv</a>.</p> |
| <p>Conda uses so-called channels to distribute packages, and together with the default channels by |
| Anaconda itself, the most important channel is <a class="reference external" href="https://conda-forge.org/">conda-forge</a>, which |
| is the community-driven packaging effort that is the most extensive & the most current (and also |
| serves as the upstream for the Anaconda channels in most cases).</p> |
| <p>To create a new conda environment from your terminal and activate it, proceed as shown below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda<span class="w"> </span>create<span class="w"> </span>-n<span class="w"> </span>pyspark_env |
| conda<span class="w"> </span>activate<span class="w"> </span>pyspark_env |
| </pre></div> |
| </div> |
| <p>After activating the environment, use the following command to install pyspark, |
| a python version of your choice, as well as other packages you want to use in |
| the same session as pyspark (you can install in several steps too).</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>conda<span class="w"> </span>install<span class="w"> </span>-c<span class="w"> </span>conda-forge<span class="w"> </span>pyspark<span class="w"> </span><span class="c1"># can also add "python=3.9 some_package [etc.]" here</span> |
| </pre></div> |
| </div> |
| <p>Note that <a class="reference external" href="https://anaconda.org/conda-forge/pyspark">PySpark for conda</a> is maintained |
| separately by the community; while new versions generally get packaged quickly, the |
| availability through conda(-forge) is not directly in sync with the PySpark release cycle.</p> |
| <p>While using pip in a conda environment is technically feasible (with the same command as |
| <a class="reference internal" href="#using-pypi"><span class="std std-ref">above</span></a>), this approach is <a class="reference external" href="https://www.anaconda.com/blog/using-pip-in-a-conda-environment/">discouraged</a>, |
| because pip does not interoperate with conda.</p> |
| <p>For a short summary about useful conda commands, see their |
| <a class="reference external" href="https://docs.conda.io/projects/conda/en/latest/user-guide/cheatsheet.html">cheat sheet</a>.</p> |
| </section> |
| <section id="manually-downloading"> |
| <h2>Manually Downloading<a class="headerlink" href="#manually-downloading" title="Permalink to this headline">#</a></h2> |
| <p>PySpark is included in the distributions available at the <a class="reference external" href="https://spark.apache.org/downloads.html">Apache Spark website</a>. |
| You can download a distribution you want from the site. After that, uncompress the tar file into the directory where you want |
| to install Spark, for example, as below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>tar<span class="w"> </span>xzvf<span class="w"> </span>spark-4.0.0-preview1-bin-hadoop3.tgz |
| </pre></div> |
| </div> |
| <p>Ensure the <code class="docutils literal notranslate"><span class="pre">SPARK_HOME</span></code> environment variable points to the directory where the tar file has been extracted. |
| Update the <code class="docutils literal notranslate"><span class="pre">PYTHONPATH</span></code> environment variable so that it can find PySpark and Py4J under <code class="docutils literal notranslate"><span class="pre">SPARK_HOME/python/lib</span></code>. |
| One example of doing this is shown below:</p> |
| <div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">cd</span><span class="w"> </span>spark-4.0.0-preview1-bin-hadoop3 |
| <span class="nb">export</span><span class="w"> </span><span class="nv">SPARK_HOME</span><span class="o">=</span><span class="sb">`</span><span class="nb">pwd</span><span class="sb">`</span> |
| <span class="nb">export</span><span class="w"> </span><span class="nv">PYTHONPATH</span><span class="o">=</span><span class="k">$(</span><span class="nv">ZIPS</span><span class="o">=(</span><span class="s2">"</span><span class="nv">$SPARK_HOME</span><span class="s2">"</span>/python/lib/*.zip<span class="k">)</span><span class="p">;</span><span class="w"> </span><span class="nv">IFS</span><span class="o">=</span>:<span class="p">;</span><span class="w"> </span><span class="nb">echo</span><span class="w"> </span><span class="s2">"</span><span class="si">${</span><span class="nv">ZIPS</span><span class="p">[*]</span><span class="si">}</span><span class="s2">"</span><span class="o">)</span>:<span class="nv">$PYTHONPATH</span> |
| </pre></div> |
| </div> |
| </section> |
| <section id="installing-from-source"> |
| <h2>Installing from Source<a class="headerlink" href="#installing-from-source" title="Permalink to this headline">#</a></h2> |
| <p>To install PySpark from source, refer to <a class="reference external" href="https://spark.apache.org/docs/4.0.0-preview1/building-spark.html">Building Spark</a>.</p> |
| </section> |
| <section id="dependencies"> |
| <h2>Dependencies<a class="headerlink" href="#dependencies" title="Permalink to this headline">#</a></h2> |
| <section id="required-dependencies"> |
| <h3>Required dependencies<a class="headerlink" href="#required-dependencies" title="Permalink to this headline">#</a></h3> |
| <p>PySpark requires the following dependencies.</p> |
| <table class="table"> |
| <colgroup> |
| <col style="width: 33%" /> |
| <col style="width: 31%" /> |
| <col style="width: 36%" /> |
| </colgroup> |
| <thead> |
| <tr class="row-odd"><th class="head"><p>Package</p></th> |
| <th class="head"><p>Supported version</p></th> |
| <th class="head"><p>Note</p></th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr class="row-even"><td><p><cite>py4j</cite></p></td> |
| <td><p>>=0.10.9.7</p></td> |
| <td><p>Required to interact with JVM</p></td> |
| </tr> |
| </tbody> |
| </table> |
| <p>Additional libraries that enhance functionality but are not included in the installation packages:</p> |
| <ul class="simple"> |
| <li><p><strong>memory-profiler</strong>: Used for PySpark UDF memory profiling, <code class="docutils literal notranslate"><span class="pre">spark.profile.show(...)</span></code> and <code class="docutils literal notranslate"><span class="pre">spark.sql.pyspark.udf.profiler</span></code>.</p></li> |
| </ul> |
| <p>Note that PySpark requires Java 17 or later with <code class="docutils literal notranslate"><span class="pre">JAVA_HOME</span></code> properly set. For details, refer to <a class="reference external" href="https://spark.apache.org/docs/4.0.0-preview1/#downloading">Downloading</a>.</p> |
| </section> |
| <section id="optional-dependencies"> |
| <span id="id2"></span><h3>Optional dependencies<a class="headerlink" href="#optional-dependencies" title="Permalink to this headline">#</a></h3> |
| <p>PySpark has several optional dependencies that enhance its functionality for specific modules. |
| These dependencies are only required for certain features and are not necessary for the basic functionality of PySpark. |
| If these optional dependencies are not installed, PySpark will function correctly for basic operations but will raise an <code class="docutils literal notranslate"><span class="pre">ImportError</span></code> |
| when you try to use features that require these dependencies.</p> |
| <section id="spark-connect"> |
| <h4>Spark Connect<a class="headerlink" href="#spark-connect" title="Permalink to this headline">#</a></h4> |
| <p>Installable with <code class="docutils literal notranslate"><span class="pre">pip</span> <span class="pre">install</span> <span class="pre">"pyspark[connect]"</span></code>.</p> |
| <table class="table"> |
| <colgroup> |
| <col style="width: 38%" /> |
| <col style="width: 25%" /> |
| <col style="width: 38%" /> |
| </colgroup> |
| <thead> |
| <tr class="row-odd"><th class="head"><p>Package</p></th> |
| <th class="head"><p>Supported version</p></th> |
| <th class="head"><p>Note</p></th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr class="row-even"><td><p><cite>pandas</cite></p></td> |
| <td><p>>=2.0.0</p></td> |
| <td><p>Required for Spark Connect</p></td> |
| </tr> |
| <tr class="row-odd"><td><p><cite>pyarrow</cite></p></td> |
| <td><p>>=10.0.0</p></td> |
| <td><p>Required for Spark Connect</p></td> |
| </tr> |
| <tr class="row-even"><td><p><cite>grpcio</cite></p></td> |
| <td><p>>=1.62.0</p></td> |
| <td><p>Required for Spark Connect</p></td> |
| </tr> |
| <tr class="row-odd"><td><p><cite>grpcio-status</cite></p></td> |
| <td><p>>=1.62.0</p></td> |
| <td><p>Required for Spark Connect</p></td> |
| </tr> |
| <tr class="row-even"><td><p><cite>googleapis-common-protos</cite></p></td> |
| <td><p>>=1.56.4</p></td> |
| <td><p>Required for Spark Connect</p></td> |
| </tr> |
| </tbody> |
| </table> |
| </section> |
| <section id="spark-sql"> |
| <h4>Spark SQL<a class="headerlink" href="#spark-sql" title="Permalink to this headline">#</a></h4> |
| <p>Installable with <code class="docutils literal notranslate"><span class="pre">pip</span> <span class="pre">install</span> <span class="pre">"pyspark[sql]"</span></code>.</p> |
| <table class="table"> |
| <colgroup> |
| <col style="width: 19%" /> |
| <col style="width: 35%" /> |
| <col style="width: 46%" /> |
| </colgroup> |
| <thead> |
| <tr class="row-odd"><th class="head"><p>Package</p></th> |
| <th class="head"><p>Supported version</p></th> |
| <th class="head"><p>Note</p></th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr class="row-even"><td><p><cite>pandas</cite></p></td> |
| <td><p>>=2.0.0</p></td> |
| <td><p>Required for Spark SQL</p></td> |
| </tr> |
| <tr class="row-odd"><td><p><cite>pyarrow</cite></p></td> |
| <td><p>>=10.0.0</p></td> |
| <td><p>Required for Spark SQL</p></td> |
| </tr> |
| </tbody> |
| </table> |
| </section> |
| <section id="pandas-api-on-spark"> |
| <h4>Pandas API on Spark<a class="headerlink" href="#pandas-api-on-spark" title="Permalink to this headline">#</a></h4> |
| <p>Installable with <code class="docutils literal notranslate"><span class="pre">pip</span> <span class="pre">install</span> <span class="pre">"pyspark[pandas_on_spark]"</span></code>.</p> |
| <table class="table"> |
| <colgroup> |
| <col style="width: 16%" /> |
| <col style="width: 29%" /> |
| <col style="width: 55%" /> |
| </colgroup> |
| <thead> |
| <tr class="row-odd"><th class="head"><p>Package</p></th> |
| <th class="head"><p>Supported version</p></th> |
| <th class="head"><p>Note</p></th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr class="row-even"><td><p><cite>pandas</cite></p></td> |
| <td><p>>=2.0.0</p></td> |
| <td><p>Required for Pandas API on Spark</p></td> |
| </tr> |
| <tr class="row-odd"><td><p><cite>pyarrow</cite></p></td> |
| <td><p>>=10.0.0</p></td> |
| <td><p>Required for Pandas API on Spark</p></td> |
| </tr> |
| </tbody> |
| </table> |
| <p>Additional libraries that enhance functionality but are not included in the installation packages:</p> |
| <ul class="simple"> |
| <li><p><strong>mlflow</strong>: Required for <code class="docutils literal notranslate"><span class="pre">pyspark.pandas.mlflow</span></code>.</p></li> |
| <li><p><strong>plotly</strong>: Provides plotting for visualization. Using <strong>plotly</strong> is recommended over <strong>matplotlib</strong>.</p></li> |
| <li><p><strong>matplotlib</strong>: Provides plotting for visualization. The default is <strong>plotly</strong>.</p></li> |
| </ul> |
| </section> |
| <section id="mllib-dataframe-based-api"> |
| <h4>MLlib DataFrame-based API<a class="headerlink" href="#mllib-dataframe-based-api" title="Permalink to this headline">#</a></h4> |
| <p>Installable with <code class="docutils literal notranslate"><span class="pre">pip</span> <span class="pre">install</span> <span class="pre">"pyspark[ml]"</span></code>.</p> |
| <table class="table"> |
| <colgroup> |
| <col style="width: 11%" /> |
| <col style="width: 27%" /> |
| <col style="width: 61%" /> |
| </colgroup> |
| <thead> |
| <tr class="row-odd"><th class="head"><p>Package</p></th> |
| <th class="head"><p>Supported version</p></th> |
| <th class="head"><p>Note</p></th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr class="row-even"><td><p><cite>numpy</cite></p></td> |
| <td><p>>=1.21</p></td> |
| <td><p>Required for MLlib DataFrame-based API</p></td> |
| </tr> |
| </tbody> |
| </table> |
| <p>Additional libraries that enhance functionality but are not included in the installation packages:</p> |
| <ul class="simple"> |
| <li><p><strong>scipy</strong>: Required for SciPy integration.</p></li> |
| <li><p><strong>scikit-learn</strong>: Required for implementing machine learning algorithms.</p></li> |
| <li><p><strong>torch</strong>: Required for machine learning model training.</p></li> |
| <li><p><strong>torchvision</strong>: Required for supporting image and video processing.</p></li> |
| <li><p><strong>torcheval</strong>: Required for facilitating model evaluation metrics.</p></li> |
| <li><p><strong>deepspeed</strong>: Required for providing high-performance model training optimizations. Installable on non-Darwin systems.</p></li> |
| </ul> |
| </section> |
| <section id="mllib"> |
| <h4>MLlib<a class="headerlink" href="#mllib" title="Permalink to this headline">#</a></h4> |
| <p>Installable with <code class="docutils literal notranslate"><span class="pre">pip</span> <span class="pre">install</span> <span class="pre">"pyspark[mllib]"</span></code>.</p> |
| <table class="table"> |
| <colgroup> |
| <col style="width: 17%" /> |
| <col style="width: 40%" /> |
| <col style="width: 43%" /> |
| </colgroup> |
| <thead> |
| <tr class="row-odd"><th class="head"><p>Package</p></th> |
| <th class="head"><p>Supported version</p></th> |
| <th class="head"><p>Note</p></th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr class="row-even"><td><p><cite>numpy</cite></p></td> |
| <td><p>>=1.21</p></td> |
| <td><p>Required for MLlib</p></td> |
| </tr> |
| </tbody> |
| </table> |
| </section> |
| </section> |
| </section> |
| </section> |
| |
| |
| </article> |
| |
| |
| |
| <footer class="bd-footer-article"> |
| |
| <div class="footer-article-items footer-article__inner"> |
| |
| <div class="footer-article-item"><!-- Previous / next buttons --> |
| <div class="prev-next-area"> |
| <a class="left-prev" |
| href="index.html" |
| title="previous page"> |
| <i class="fa-solid fa-angle-left"></i> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">previous</p> |
| <p class="prev-next-title">Getting Started</p> |
| </div> |
| </a> |
| <a class="right-next" |
| href="quickstart_df.html" |
| title="next page"> |
| <div class="prev-next-info"> |
| <p class="prev-next-subtitle">next</p> |
| <p class="prev-next-title">Quickstart: DataFrame</p> |
| </div> |
| <i class="fa-solid fa-angle-right"></i> |
| </a> |
| </div></div> |
| |
| </div> |
| |
| </footer> |
| |
| </div> |
| |
| |
| |
| <div class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner"> |
| |
| <div class="sidebar-secondary-item"> |
| <div class="page-toc tocsection onthispage"> |
| <i class="fa-solid fa-list"></i> On this page |
| </div> |
| <nav class="bd-toc-nav page-toc"> |
| <ul class="visible nav section-nav flex-column"> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#python-versions-supported">Python Versions Supported</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#using-pypi">Using PyPI</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#python-spark-connect-client">Python Spark Connect Client</a></li> |
| </ul> |
| </li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#using-conda">Using Conda</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#manually-downloading">Manually Downloading</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#installing-from-source">Installing from Source</a></li> |
| <li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#dependencies">Dependencies</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#required-dependencies">Required dependencies</a></li> |
| <li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#optional-dependencies">Optional dependencies</a><ul class="nav section-nav flex-column"> |
| <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#spark-connect">Spark Connect</a></li> |
| <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#spark-sql">Spark SQL</a></li> |
| <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#pandas-api-on-spark">Pandas API on Spark</a></li> |
| <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#mllib-dataframe-based-api">MLlib DataFrame-based API</a></li> |
| <li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#mllib">MLlib</a></li> |
| </ul> |
| </li> |
| </ul> |
| </li> |
| </ul> |
| </nav></div> |
| |
| <div class="sidebar-secondary-item"> |
| <div class="tocsection sourcelink"> |
| <a href="../_sources/getting_started/install.rst.txt"> |
| <i class="fa-solid fa-file-lines"></i> Show Source |
| </a> |
| </div> |
| </div> |
| |
| </div></div> |
| |
| |
| </div> |
| <footer class="bd-footer-content"> |
| |
| </footer> |
| |
| </main> |
| </div> |
| </div> |
| |
| <!-- Scripts loaded after <body> so the DOM is not blocked --> |
| <script src="../_static/scripts/bootstrap.js?digest=e353d410970836974a52"></script> |
| <script src="../_static/scripts/pydata-sphinx-theme.js?digest=e353d410970836974a52"></script> |
| |
| <footer class="bd-footer"> |
| <div class="bd-footer__inner bd-page-width"> |
| |
| <div class="footer-items__start"> |
| |
| <div class="footer-item"><p class="copyright"> |
| Copyright © 2024 The Apache Software Foundation, Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>. |
| </p></div> |
| |
| <div class="footer-item"> |
| <p class="sphinx-version"> |
| Created using <a href="https://www.sphinx-doc.org/">Sphinx</a> 4.5.0. |
| <br/> |
| </p> |
| </div> |
| |
| </div> |
| |
| |
| <div class="footer-items__end"> |
| |
| <div class="footer-item"><p class="theme-version"> |
| Built with the <a href="https://pydata-sphinx-theme.readthedocs.io/en/stable/index.html">PyData Sphinx Theme</a> 0.13.3. |
| </p></div> |
| |
| </div> |
| |
| </div> |
| |
| </footer> |
| </body> |
| </html> |