| |
| <!DOCTYPE html> |
| <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> |
| <!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> |
| <!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> |
| <!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| |
| <title>Spark Connect Overview - Spark 3.5.3 Documentation</title> |
| |
| |
| |
| |
| |
| <link rel="stylesheet" href="css/bootstrap.min.css"> |
| <link rel="preconnect" href="https://fonts.googleapis.com"> |
| <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> |
| <link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&Courier+Prime:wght@400;700&display=swap" rel="stylesheet"> |
| <link href="css/custom.css" rel="stylesheet"> |
| <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> |
| |
| <link rel="stylesheet" href="css/pygments-default.css"> |
| <link rel="stylesheet" href="css/docsearch.min.css" /> |
| <link rel="stylesheet" href="css/docsearch.css"> |
| |
| |
| <!-- Matomo --> |
| <script> |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| _paq.push(["disableCookies"]); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function() { |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '40']); |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| |
| |
| </head> |
| <body class="global"> |
| <!--[if lt IE 7]> |
| <p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> |
| <![endif]--> |
| |
| <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html --> |
| |
| <nav class="navbar navbar-expand-lg navbar-dark p-0 px-4 fixed-top" style="background: #1d6890;" id="topbar"> |
| <div class="navbar-brand"><a href="index.html"> |
| <img src="img/spark-logo-rev.svg" width="141" height="72"/></a><span class="version">3.5.3</span> |
| </div> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" |
| data-target="#navbarCollapse" aria-controls="navbarCollapse" |
| aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| <div class="collapse navbar-collapse" id="navbarCollapse"> |
| <ul class="navbar-nav me-auto"> |
| <li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a> |
| <div class="dropdown-menu" aria-labelledby="navbarQuickStart"> |
| <a class="dropdown-item" href="quick-start.html">Quick Start</a> |
| <a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a> |
| <a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a> |
| <a class="dropdown-item" href="structured-streaming-programming-guide.html">Structured Streaming</a> |
| <a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a> |
| <a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a> |
| <a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a> |
| <a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a> |
| <a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a> |
| <div class="dropdown-menu" aria-labelledby="navbarAPIDocs"> |
| <a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a> |
| <a class="dropdown-item" href="api/java/index.html">Java</a> |
| <a class="dropdown-item" href="api/python/index.html">Python</a> |
| <a class="dropdown-item" href="api/R/index.html">R</a> |
| <a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a> |
| <div class="dropdown-menu" aria-labelledby="navbarDeploying"> |
| <a class="dropdown-item" href="cluster-overview.html">Overview</a> |
| <a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a> |
| <div class="dropdown-divider"></div> |
| <a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a> |
| <a class="dropdown-item" href="running-on-mesos.html">Mesos</a> |
| <a class="dropdown-item" href="running-on-yarn.html">YARN</a> |
| <a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a> |
| <div class="dropdown-menu" aria-labelledby="navbarMore"> |
| <a class="dropdown-item" href="configuration.html">Configuration</a> |
| <a class="dropdown-item" href="monitoring.html">Monitoring</a> |
| <a class="dropdown-item" href="tuning.html">Tuning Guide</a> |
| <a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a> |
| <a class="dropdown-item" href="security.html">Security</a> |
| <a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a> |
| <a class="dropdown-item" href="migration-guide.html">Migration Guide</a> |
| <div class="dropdown-divider"></div> |
| <a class="dropdown-item" href="building-spark.html">Building Spark</a> |
| <a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a> |
| <a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a> |
| </div> |
| </li> |
| |
| <li class="nav-item"> |
| <input type="text" id="docsearch-input" placeholder="Search the docs…"> |
| </li> |
| </ul> |
| <!--<span class="navbar-text navbar-right"><span class="version-text">v3.5.3</span></span>--> |
| </div> |
| </nav> |
| |
| |
| |
| <div class="container"> |
| |
| |
| <div class="content mr-3" id="content"> |
| |
| |
| <h1 class="title">Spark Connect Overview</h1> |
| |
| |
| <p><strong>Building client-side Spark applications</strong></p> |
| |
| <p>In Apache Spark 3.4, Spark Connect introduced a decoupled client-server |
| architecture that allows remote connectivity to Spark clusters using the |
| DataFrame API and unresolved logical plans as the protocol. The separation |
| between client and server allows Spark and its open ecosystem to be |
| leveraged from anywhere: it can be embedded in modern data applications, |
| IDEs, notebooks, and programming languages.</p> |
| |
| <p>To get started, see <a href="api/python/getting_started/quickstart_connect.html">Quickstart: Spark Connect</a>.</p> |
| |
| <p style="text-align: center;"> |
| <img src="img/spark-connect-api.png" title="Spark Connect API" alt="Spark Connect API Diagram" /> |
| </p> |
| |
| <h1 id="how-spark-connect-works">How Spark Connect works</h1> |
| |
| <p>The Spark Connect client library is designed to simplify Spark application |
| development. It is a thin API that can be embedded everywhere: in application |
| servers, IDEs, notebooks, and programming languages. The Spark Connect API |
| builds on Spark’s DataFrame API using unresolved logical plans as a |
| language-agnostic protocol between the client and the Spark driver.</p> |
| |
| <p>The Spark Connect client translates DataFrame operations into unresolved |
| logical query plans which are encoded using protocol buffers. These are sent |
| to the server using the gRPC framework.</p> |
| |
| <p>The Spark Connect endpoint embedded on the Spark Server receives and |
| translates unresolved logical plans into Spark’s logical plan operators. |
| This is similar to parsing a SQL query, where attributes and relations are |
| parsed and an initial parse plan is built. From there, the standard Spark |
| execution process kicks in, ensuring that Spark Connect leverages all of |
| Spark’s optimizations and enhancements. Results are streamed back to the |
| client through gRPC as Apache Arrow-encoded row batches.</p> |
| |
| <p style="text-align: center;"> |
| <img src="img/spark-connect-communication.png" title="Spark Connect communication" alt="Spark Connect communication" /> |
| </p> |
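| |
| <p>The following minimal PySpark sketch illustrates this flow (assuming a Spark Connect |
| server is reachable at <code class="language-plaintext highlighter-rouge">sc://localhost</code>): transformations only build up an unresolved |
| plan on the client, and an action triggers the gRPC round trip.</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"># A minimal sketch, assuming a Spark Connect server is running at sc://localhost. |
| from pyspark.sql import SparkSession |
| |
| spark = SparkSession.builder.remote("sc://localhost").getOrCreate() |
| |
| # Transformations only accumulate an unresolved logical plan on the client; |
| # nothing is sent to the server yet. |
| df = spark.range(100).filter("id % 2 == 0") |
| |
| # An action serializes the plan to protocol buffers, sends it over gRPC, |
| # and streams the result back as Arrow-encoded batches. |
| print(df.count())  # 50</code></pre></figure> |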
| |
| <h1 id="operational-benefits-of-spark-connect">Operational benefits of Spark Connect</h1> |
| |
| <p>With this new architecture, Spark Connect mitigates several multi-tenant |
| operational issues:</p> |
| |
| <p><strong>Stability</strong>: Applications that use too much memory will now only impact their |
| own environment as they can run in their own processes. Users can define their |
| own dependencies on the client and don’t need to worry about potential conflicts |
| with the Spark driver.</p> |
| |
| <p><strong>Upgradability</strong>: The Spark driver can now seamlessly be upgraded independently |
| of applications, for example to benefit from performance improvements and security fixes. |
| This means applications can be forward-compatible, as long as the server-side RPC |
| definitions are designed to be backwards compatible.</p> |
| |
| <p><strong>Debuggability and observability</strong>: Spark Connect enables interactive debugging |
| during development directly from your favorite IDE. Similarly, applications can |
| be monitored using the application framework’s native metrics and logging libraries.</p> |
| |
| <h1 id="how-to-use-spark-connect">How to use Spark Connect</h1> |
| |
| <p>Starting with Spark 3.4, Spark Connect is available and supports PySpark and Scala |
| applications. We will walk through how to run an Apache Spark server with Spark |
| Connect and connect to it from a client application using the Spark Connect client |
| library.</p> |
| |
| <h2 id="download-and-start-spark-server-with-spark-connect">Download and start Spark server with Spark Connect</h2> |
| |
| <p>First, download Spark from the |
| <a href="https://spark.apache.org/downloads.html">Download Apache Spark</a> page. Spark Connect |
| was introduced in Apache Spark version 3.4, so make sure you choose 3.4.0 or newer in |
| the release drop-down at the top of the page. Then choose your package type, typically |
| “Pre-built for Apache Hadoop 3.3 and later”, and click the link to download.</p> |
| |
| <p>Now extract the Spark package you just downloaded on your computer, for example:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nb">tar</span> <span class="nt">-xvf</span> spark-3.5.3-bin-hadoop3.tgz</code></pre></figure> |
| |
| <p>In a terminal window, go to the <code class="language-plaintext highlighter-rouge">spark</code> folder in the location where you extracted |
| Spark before and run the <code class="language-plaintext highlighter-rouge">start-connect-server.sh</code> script to start Spark server with |
| Spark Connect, like in this example:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash">./sbin/start-connect-server.sh <span class="nt">--packages</span> org.apache.spark:spark-connect_2.12:3.5.3</code></pre></figure> |
| |
| <p>Note that we include the Spark Connect package (<code class="language-plaintext highlighter-rouge">spark-connect_2.12:3.5.3</code>) when starting |
| the Spark server; this is required to use Spark Connect. Make sure to use the same version |
| of the package as the Spark version you downloaded previously: in this example, |
| Spark 3.5.3 with Scala 2.12.</p> |
| |
| <p>Now the Spark server is running and ready to accept Spark Connect sessions from client |
| applications.</p> |
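| |
| <p>As a quick smoke test (a minimal sketch, assuming PySpark is installed on the client), |
| you can connect to the server and run a trivial job from Python:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"># Connects over Spark Connect to the local server started above and runs a tiny job. |
| from pyspark.sql import SparkSession |
| |
| spark = SparkSession.builder.remote("sc://localhost").getOrCreate() |
| assert spark.range(5).count() == 5 |
| spark.stop()</code></pre></figure> |
| |
| <p>In the next section we will walk through how to use Spark Connect |
| when writing client applications.</p> |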
| |
| <h2 id="use-spark-connect-for-interactive-analysis">Use Spark Connect for interactive analysis</h2> |
| <div class="codetabs"> |
| |
| <div data-lang="python"> |
| <p>When creating a Spark session, you can specify that you want to use Spark Connect; |
| there are a few ways to do that, outlined as follows.</p> |
| |
| <p>If you do not use one of the mechanisms outlined here, your Spark session will |
| work just like before, without leveraging Spark Connect.</p> |
| |
| <h3 id="set-sparkremote-environment-variable">Set SPARK_REMOTE environment variable</h3> |
| |
| <p>If you set the <code class="language-plaintext highlighter-rouge">SPARK_REMOTE</code> environment variable on the client machine where your |
| Spark client application is running and create a new Spark Session as in the following |
| example, the session will be a Spark Connect session. With this approach, there is no |
| code change needed to start using Spark Connect.</p> |
| |
| <p>In a terminal window, set the <code class="language-plaintext highlighter-rouge">SPARK_REMOTE</code> environment variable to point to the |
| local Spark server you started previously on your computer:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nb">export </span><span class="nv">SPARK_REMOTE</span><span class="o">=</span><span class="s2">"sc://localhost"</span></code></pre></figure> |
| |
| <p>And start the Spark shell as usual:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash">./bin/pyspark</code></pre></figure> |
| |
| <p>The PySpark shell is now connected to Spark using Spark Connect as indicated in the welcome message:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">Client</span> <span class="n">connected</span> <span class="n">to</span> <span class="n">the</span> <span class="n">Spark</span> <span class="n">Connect</span> <span class="n">server</span> <span class="n">at</span> <span class="n">localhost</span></code></pre></figure> |
| |
| <h3 id="specify-spark-connect-when-creating-spark-session">Specify Spark Connect when creating Spark session</h3> |
| |
| <p>You can also specify that you want to use Spark Connect explicitly when you |
| create a Spark session.</p> |
| |
| <p>For example, you can launch the PySpark shell with Spark Connect as |
| illustrated here.</p> |
| |
| <p>To launch the PySpark shell with Spark Connect, simply include the <code class="language-plaintext highlighter-rouge">remote</code> |
| parameter and specify the location of your Spark server. We are using <code class="language-plaintext highlighter-rouge">localhost</code> |
| in this example to connect to the local Spark server we started previously:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash">./bin/pyspark <span class="nt">--remote</span> <span class="s2">"sc://localhost"</span></code></pre></figure> |
| |
| <p>And you will notice that the PySpark shell welcome message tells you that |
| you have connected to Spark using Spark Connect:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">Client</span> <span class="n">connected</span> <span class="n">to</span> <span class="n">the</span> <span class="n">Spark</span> <span class="n">Connect</span> <span class="n">server</span> <span class="n">at</span> <span class="n">localhost</span></code></pre></figure> |
| |
| <p>You can also check the Spark session type. If it includes <code class="language-plaintext highlighter-rouge">.connect.</code>, you |
| are using Spark Connect, as shown in this example:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">SparkSession</span> <span class="n">available</span> <span class="k">as</span> <span class="s">'spark'</span><span class="p">.</span> |
| <span class="o">>>></span> <span class="nb">type</span><span class="p">(</span><span class="n">spark</span><span class="p">)</span> |
| <span class="o"><</span><span class="k">class</span> <span class="err">'</span><span class="nc">pyspark</span><span class="p">.</span><span class="n">sql</span><span class="p">.</span><span class="n">connect</span><span class="p">.</span><span class="n">session</span><span class="p">.</span><span class="n">SparkSession</span><span class="s">'></span></code></pre></figure> |
| |
| <p>Now you can run PySpark code in the shell to see Spark Connect in action:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="o">>>></span> <span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="s">"id"</span><span class="p">,</span><span class="s">"name"</span><span class="p">]</span> |
| <span class="o">>>></span> <span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="mi">1</span><span class="p">,</span><span class="s">"Sarah"</span><span class="p">),(</span><span class="mi">2</span><span class="p">,</span><span class="s">"Maria"</span><span class="p">)]</span> |
| <span class="o">>>></span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">).</span><span class="n">toDF</span><span class="p">(</span><span class="o">*</span><span class="n">columns</span><span class="p">)</span> |
| <span class="o">>>></span> <span class="n">df</span><span class="p">.</span><span class="n">show</span><span class="p">()</span> |
| <span class="o">+---+-----+</span> |
| <span class="o">|</span> <span class="nb">id</span><span class="o">|</span> <span class="n">name</span><span class="o">|</span> |
| <span class="o">+---+-----+</span> |
| <span class="o">|</span> <span class="mi">1</span><span class="o">|</span><span class="n">Sarah</span><span class="o">|</span> |
| <span class="o">|</span> <span class="mi">2</span><span class="o">|</span><span class="n">Maria</span><span class="o">|</span> |
| <span class="o">+---+-----+</span></code></pre></figure> |
| |
| </div> |
| |
| <div data-lang="scala"> |
| <p>For the Scala shell, we use an Ammonite-based REPL that is currently not included in the Apache Spark package.</p> |
| |
| <p>To set up the new Scala shell, first download and install <a href="https://get-coursier.io/docs/cli-installation">Coursier CLI</a>. |
| Then, install the REPL using the following command in a terminal window:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash">cs <span class="nb">install</span> –-contrib spark-connect-repl</code></pre></figure> |
| |
| <p>And now you can start the Ammonite-based Scala REPL/shell to connect to your Spark server like this:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash">spark-connect-repl</code></pre></figure> |
| |
| <p>A greeting message will appear when the REPL successfully initializes:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash">Spark session available as <span class="s1">'spark'</span><span class="nb">.</span> |
| _____ __ ______ __ |
| / ___/____ ____ ______/ /__ / ____/___ ____ ____ ___ _____/ /_ |
| <span class="se">\_</span>_ <span class="se">\/</span> __ <span class="se">\/</span> __ <span class="sb">`</span>/ ___/ //_/ / / / __ <span class="se">\/</span> __ <span class="se">\/</span> __ <span class="se">\/</span> _ <span class="se">\/</span> ___/ __/ |
| ___/ / /_/ / /_/ / / / ,< / /___/ /_/ / / / / / / / __/ /__/ /_ |
| /____/ .___/<span class="se">\_</span>_,_/_/ /_/|_| <span class="se">\_</span>___/<span class="se">\_</span>___/_/ /_/_/ /_/<span class="se">\_</span>__/<span class="se">\_</span>__/<span class="se">\_</span>_/ |
| /_/</code></pre></figure> |
| |
| <p>By default, the REPL will attempt to connect to a local Spark Server. |
| Run the following Scala code in the shell to see Spark Connect in action:</p> |
| |
| <figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">@</span> <span class="nv">spark</span><span class="o">.</span><span class="py">range</span><span class="o">(</span><span class="mi">10</span><span class="o">).</span><span class="py">count</span> |
| <span class="n">res0</span><span class="k">:</span> <span class="kt">Long</span> <span class="o">=</span> <span class="mi">10L</span></code></pre></figure> |
| |
| <h3 id="configure-client-server-connection">Configure client-server connection</h3> |
| |
| <p>By default, the REPL will attempt to connect to a local Spark Server on port 15002. |
| The connection, however, may be configured in several ways as described in this configuration |
| <a href="https://github.com/apache/spark/blob/master/connector/connect/docs/client-connection-string.md">reference</a>.</p> |
| |
| <h4 id="set-sparkremote-environment-variable-1">Set SPARK_REMOTE environment variable</h4> |
| |
| <p>The SPARK_REMOTE environment variable can be set on the client machine to customize the client-server |
| connection that is initialized at REPL startup.</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nb">export </span><span class="nv">SPARK_REMOTE</span><span class="o">=</span><span class="s2">"sc://myhost.com:443/;token=ABCDEFG"</span> |
| spark-connect-repl</code></pre></figure> |
| |
| <p>or</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nv">SPARK_REMOTE</span><span class="o">=</span><span class="s2">"sc://myhost.com:443/;token=ABCDEFG"</span> spark-connect-repl</code></pre></figure> |
| |
| <h4 id="use-cli-arguments">Use CLI arguments</h4> |
| |
| <p>The customizations may also be passed in through CLI arguments as shown below:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash">spark-connect-repl <span class="nt">--host</span> myhost.com <span class="nt">--port</span> 443 <span class="nt">--token</span> ABCDEFG</code></pre></figure> |
| |
| <p>The supported list of CLI arguments may be found <a href="https://github.com/apache/spark/blob/master/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkConnectClientParser.scala#L48">here</a>.</p> |
| |
| <h4 id="configure-programmatically-with-a-connection-ctring">Configure programmatically with a connection ctring</h4> |
| |
| <p>The connection may also be created programmatically using <em>SparkSession#builder</em>, as in this example:</p> |
| |
| <figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">@</span> <span class="k">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span> |
| <span class="k">@</span> <span class="k">val</span> <span class="nv">spark</span> <span class="k">=</span> <span class="nv">SparkSession</span><span class="o">.</span><span class="py">builder</span><span class="o">.</span><span class="py">remote</span><span class="o">(</span><span class="s">"sc://localhost:443/;token=ABCDEFG"</span><span class="o">).</span><span class="py">build</span><span class="o">()</span></code></pre></figure> |
| |
| </div> |
| </div> |
| |
| <h2 id="use-spark-connect-in-standalone-applications">Use Spark Connect in standalone applications</h2> |
| |
| <div class="codetabs"> |
| |
| |
| <div data-lang="python"> |
| |
| <p>First, install PySpark with <code class="language-plaintext highlighter-rouge">pip install pyspark[connect]==3.5.3</code> or, if building a packaged PySpark application/library, |
| add it to your setup.py file as:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">install_requires</span><span class="o">=</span><span class="p">[</span> |
|   <span class="s">'pyspark[connect]==3.5.3'</span> |
| <span class="p">]</span></code></pre></figure> |
| |
| <p>When writing your own code, include the <code class="language-plaintext highlighter-rouge">remote</code> function with a reference to |
| your Spark server when you create a Spark session, as in this example:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="p">.</span><span class="n">builder</span><span class="p">.</span><span class="n">remote</span><span class="p">(</span><span class="s">"sc://localhost"</span><span class="p">).</span><span class="n">getOrCreate</span><span class="p">()</span></code></pre></figure> |
| |
| <p>For illustration purposes, we’ll create a simple Spark Connect application, SimpleApp.py:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="s">"""SimpleApp.py"""</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| |
| <span class="n">logFile</span> <span class="o">=</span> <span class="s">"YOUR_SPARK_HOME/README.md"</span> <span class="c1"># Should be some file on your system |
| </span><span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="p">.</span><span class="n">builder</span><span class="p">.</span><span class="n">remote</span><span class="p">(</span><span class="s">"sc://localhost"</span><span class="p">).</span><span class="n">appName</span><span class="p">(</span><span class="s">"SimpleApp"</span><span class="p">).</span><span class="n">getOrCreate</span><span class="p">()</span> |
| <span class="n">logData</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="n">text</span><span class="p">(</span><span class="n">logFile</span><span class="p">).</span><span class="n">cache</span><span class="p">()</span> |
| |
| <span class="n">numAs</span> <span class="o">=</span> <span class="n">logData</span><span class="p">.</span><span class="nb">filter</span><span class="p">(</span><span class="n">logData</span><span class="p">.</span><span class="n">value</span><span class="p">.</span><span class="n">contains</span><span class="p">(</span><span class="s">'a'</span><span class="p">)).</span><span class="n">count</span><span class="p">()</span> |
| <span class="n">numBs</span> <span class="o">=</span> <span class="n">logData</span><span class="p">.</span><span class="nb">filter</span><span class="p">(</span><span class="n">logData</span><span class="p">.</span><span class="n">value</span><span class="p">.</span><span class="n">contains</span><span class="p">(</span><span class="s">'b'</span><span class="p">)).</span><span class="n">count</span><span class="p">()</span> |
| |
| <span class="k">print</span><span class="p">(</span><span class="s">"Lines with a: %i, lines with b: %i"</span> <span class="o">%</span> <span class="p">(</span><span class="n">numAs</span><span class="p">,</span> <span class="n">numBs</span><span class="p">))</span> |
| |
| <span class="n">spark</span><span class="p">.</span><span class="n">stop</span><span class="p">()</span></code></pre></figure> |
| |
| <p>This program just counts the number of lines containing ‘a’ and the number containing ‘b’ in a text file. |
| Note that you’ll need to replace YOUR_SPARK_HOME with the location where Spark is installed.</p> |
| |
| <p>We can run this application with the regular Python interpreter as follows:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="c1"># Use the Python interpreter to run your application |
| </span><span class="err">$</span> <span class="n">python</span> <span class="n">SimpleApp</span><span class="p">.</span><span class="n">py</span> |
| <span class="p">...</span> |
| <span class="n">Lines</span> <span class="k">with</span> <span class="n">a</span><span class="p">:</span> <span class="mi">72</span><span class="p">,</span> <span class="n">lines</span> <span class="k">with</span> <span class="n">b</span><span class="p">:</span> <span class="mi">39</span></code></pre></figure> |
| |
| </div> |
| |
| |
| <div data-lang="scala"> |
| <p>To use Spark Connect as part of a Scala application/project, we first need to include the right dependencies. |
| Using the <code class="language-plaintext highlighter-rouge">sbt</code> build system as an example, we add the following dependencies to the <code class="language-plaintext highlighter-rouge">build.sbt</code> file:</p> |
| |
| <figure class="highlight"><pre><code class="language-sbt" data-lang="sbt">libraryDependencies += "org.apache.spark" %% "spark-sql-api" % "3.5.0" |
| libraryDependencies += "org.apache.spark" %% "spark-connect-client-jvm" % "3.5.0"</code></pre></figure> |
| |
| <p>When writing your own code, include the <code class="language-plaintext highlighter-rouge">remote</code> function with a reference to |
| your Spark server when you create a Spark session, as in this example:</p> |
| |
| <figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span> |
| <span class="k">val</span> <span class="nv">spark</span> <span class="k">=</span> <span class="nv">SparkSession</span><span class="o">.</span><span class="py">builder</span><span class="o">().</span><span class="py">remote</span><span class="o">(</span><span class="s">"sc://localhost"</span><span class="o">).</span><span class="py">build</span><span class="o">()</span></code></pre></figure> |
| |
| <p><strong>Note</strong>: Operations that reference user-defined code, such as UDFs, filter, map, etc., require a |
| <a href="https://github.com/apache/spark/blob/bb41cd889efdd0602385e70b4c8f1c93740db332/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/client/ClassFinder.scala#L26">ClassFinder</a> |
| to be registered to pick up and upload any required classfiles. Also, any JAR dependencies must be uploaded to the server using <code class="language-plaintext highlighter-rouge">SparkSession#addArtifact</code>.</p> |
| |
| <p>Example:</p> |
| |
| <figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.sql.connect.client.REPLClassDirMonitor</span> |
| <span class="c1">// Register a ClassFinder to monitor and upload the classfiles from the build output.</span> |
| <span class="k">val</span> <span class="nv">classFinder</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">REPLClassDirMonitor</span><span class="o">(<</span><span class="nc">ABSOLUTE_PATH_TO_BUILD_OUTPUT_DIR</span><span class="o">>)</span> |
| <span class="nv">spark</span><span class="o">.</span><span class="py">registerClassFinder</span><span class="o">(</span><span class="n">classfinder</span><span class="o">)</span> |
| |
| <span class="c1">// Upload JAR dependencies</span> |
| <span class="nv">spark</span><span class="o">.</span><span class="py">addArtifact</span><span class="o">(<</span><span class="nc">ABSOLUTE_PATH_JAR_DEP</span><span class="o">>)</span></code></pre></figure> |
| |
| <p>Here, <code class="language-plaintext highlighter-rouge">ABSOLUTE_PATH_TO_BUILD_OUTPUT_DIR</code> is the output directory where the build system writes classfiles into |
| and <code class="language-plaintext highlighter-rouge">ABSOLUTE_PATH_JAR_DEP</code> is the location of the JAR on the local file system.</p> |
| |
| <p>The <code class="language-plaintext highlighter-rouge">REPLClassDirMonitor</code> is a provided implementation of <code class="language-plaintext highlighter-rouge">ClassFinder</code> that monitors a specific directory but |
| one may implement their own class extending <code class="language-plaintext highlighter-rouge">ClassFinder</code> for customized search and monitoring.</p> |
| |
| </div> |
| </div> |
| |
| <h1 id="client-application-authentication">Client application authentication</h1> |
| |
| <p>While Spark Connect does not have built-in authentication, it is designed to |
| work seamlessly with your existing authentication infrastructure. Its gRPC |
| HTTP/2 interface allows for the use of authenticating proxies, which makes |
| it possible to secure Spark Connect without having to implement authentication |
| logic in Spark directly.</p> |
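| |
| <p>For example (a minimal sketch with a hypothetical proxy host and token), credentials can |
| be carried in the Spark Connect connection string, so an authenticating proxy sitting in |
| front of the Spark Connect endpoint can validate each request:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"># Hypothetical example: gateway.example.com is an authenticating proxy in front |
| # of the Spark Connect server; the token is sent along with every gRPC request. |
| from pyspark.sql import SparkSession |
| |
| spark = ( |
|     SparkSession.builder |
|     .remote("sc://gateway.example.com:443/;use_ssl=true;token=MY_TOKEN") |
|     .getOrCreate() |
| )</code></pre></figure> |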
| |
| <h1 id="what-is-supported-in-spark-34">What is supported in Spark 3.4</h1> |
| |
| <p><strong>PySpark</strong>: In Spark 3.4, Spark Connect supports most PySpark APIs, including |
| <a href="api/python/reference/pyspark.sql/dataframe.html">DataFrame</a>, |
| <a href="api/python/reference/pyspark.sql/functions.html">Functions</a>, and |
| <a href="api/python/reference/pyspark.sql/column.html">Column</a>. However, |
| some APIs such as <a href="api/python/reference/api/pyspark.SparkContext.html">SparkContext</a> |
| and <a href="api/python/reference/api/pyspark.RDD.html">RDD</a> are not supported. |
| You can check which APIs are currently |
| supported in the <a href="api/python/reference/index.html">API reference</a> documentation. |
| Supported APIs are labeled “Supports Spark Connect” so you can check whether the |
| APIs you are using are available before migrating existing code to Spark Connect.</p> |
| |
| <p><strong>Scala</strong>: In Spark 3.5, Spark Connect supports most Scala APIs, including |
| <a href="api/scala/org/apache/spark/sql/Dataset.html">Dataset</a>, |
| <a href="api/scala/org/apache/spark/sql/functions$.html">functions</a>, |
| <a href="api/scala/org/apache/spark/sql/Column.html">Column</a>, |
| <a href="api/scala/org/apache/spark/sql/catalog/Catalog.html">Catalog</a> and |
| <a href="api/scala/org/apache/spark/sql/KeyValueGroupedDataset.html">KeyValueGroupedDataset</a>.</p> |
| |
| <p>User-Defined Functions (UDFs) are supported: by default in the shell, and in standalone applications with |
| additional set-up requirements.</p> |
| |
| <p>The majority of the Streaming API is supported, including |
| <a href="api/scala/org/apache/spark/sql/streaming/DataStreamReader.html">DataStreamReader</a>, |
| <a href="api/scala/org/apache/spark/sql/streaming/DataStreamWriter.html">DataStreamWriter</a>, |
| <a href="api/scala/org/apache/spark/sql/streaming/StreamingQuery.html">StreamingQuery</a> and |
| <a href="api/scala/org/apache/spark/sql/streaming/StreamingQueryListener.html">StreamingQueryListener</a>.</p> |
| |
| <p>APIs such as <a href="api/scala/org/apache/spark/SparkContext.html">SparkContext</a> |
| and <a href="api/scala/org/apache/spark/rdd/RDD.html">RDD</a> are deprecated in all Spark Connect versions.</p> |
| |
| <p>Support for more APIs is planned for upcoming Spark releases.</p> |
| |
| </div> |
| |
| <!-- /container --> |
| </div> |
| |
| <script src="js/vendor/jquery-3.5.1.min.js"></script> |
| <script src="js/vendor/bootstrap.bundle.min.js"></script> |
| |
| <script src="js/vendor/anchor.min.js"></script> |
| <script src="js/main.js"></script> |
| |
| <script type="text/javascript" src="js/vendor/docsearch.min.js"></script> |
| <script type="text/javascript"> |
| // DocSearch is entirely free and automated. DocSearch is built in two parts: |
| // 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link |
| //    in your website and extracts content from every page it traverses. It then pushes this |
| // content to an Algolia index. |
| // 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index |
| // to your search input and display its results in a dropdown UI. If you want to find more |
| //    details on how DocSearch works, check the docs of DocSearch. |
| docsearch({ |
| apiKey: 'd62f962a82bc9abb53471cb7b89da35e', |
| appId: 'RAI69RXRSK', |
| indexName: 'apache_spark', |
| inputSelector: '#docsearch-input', |
| enhancedSearchInput: true, |
| algoliaOptions: { |
| 'facetFilters': ["version:3.5.3"] |
| }, |
| debug: false // Set debug to true if you want to inspect the dropdown |
| }); |
| |
| </script> |
| |
| <!-- MathJax Section --> |
| <script type="text/x-mathjax-config"> |
| MathJax.Hub.Config({ |
| TeX: { equationNumbers: { autoNumber: "AMS" } } |
| }); |
| </script> |
| <script> |
| // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. |
| // We could use "//cdn.mathjax...", but that won't support "file://". |
| (function(d, script) { |
| script = d.createElement('script'); |
| script.type = 'text/javascript'; |
| script.async = true; |
| script.onload = function(){ |
| MathJax.Hub.Config({ |
| tex2jax: { |
| inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], |
| displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], |
| processEscapes: true, |
| skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] |
| } |
| }); |
| }; |
| script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + |
| 'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' + |
| '?config=TeX-AMS-MML_HTMLorMML'; |
| d.getElementsByTagName('head')[0].appendChild(script); |
| }(document)); |
| </script> |
| </body> |
| </html> |