| |
| |
| |
| |
| <!DOCTYPE html> |
| <html class="no-js"> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| |
| <title>Spark Connect Overview - Spark 4.0.1 Documentation</title> |
| |
| |
| |
| |
| |
| <link rel="stylesheet" href="css/bootstrap.min.css"> |
| <link rel="preconnect" href="https://fonts.googleapis.com"> |
| <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> |
| <link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&Courier+Prime:wght@400;700&display=swap" rel="stylesheet"> |
| <link href="css/custom.css" rel="stylesheet"> |
| <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> |
| |
| <link rel="stylesheet" href="css/pygments-default.css"> |
| <link rel="stylesheet" href="css/docsearch.min.css" /> |
| <link rel="stylesheet" href="css/docsearch.css"> |
| |
| |
| <!-- Matomo --> |
| <script> |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| _paq.push(["disableCookies"]); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function() { |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '40']); |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| |
| |
| </head> |
| <body class="global"> |
| <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html --> |
| <nav class="navbar navbar-expand-lg navbar-dark p-0 px-4 fixed-top" style="background: #1d6890;" id="topbar"> |
| <div class="navbar-brand"><a href="index.html"> |
| <img src="https://spark.apache.org/images/spark-logo-rev.svg" width="141" height="72"/></a><span class="version">4.0.1</span> |
| </div> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" |
| data-target="#navbarCollapse" aria-controls="navbarCollapse" |
| aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| <div class="collapse navbar-collapse" id="navbarCollapse"> |
| <ul class="navbar-nav me-auto"> |
| <li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a> |
| <div class="dropdown-menu" aria-labelledby="navbarQuickStart"> |
| <a class="dropdown-item" href="quick-start.html">Quick Start</a> |
| <a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a> |
| <a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a> |
| <a class="dropdown-item" href="streaming/index.html">Structured Streaming</a> |
| <a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a> |
| <a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a> |
| <a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a> |
| <a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a> |
| <a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a> |
| <div class="dropdown-menu" aria-labelledby="navbarAPIDocs"> |
| <a class="dropdown-item" href="api/python/index.html">Python</a> |
| <a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a> |
| <a class="dropdown-item" href="api/java/index.html">Java</a> |
| <a class="dropdown-item" href="api/R/index.html">R</a> |
| <a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a> |
| <div class="dropdown-menu" aria-labelledby="navbarDeploying"> |
| <a class="dropdown-item" href="cluster-overview.html">Overview</a> |
| <a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a> |
| <div class="dropdown-divider"></div> |
| <a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a> |
| <a class="dropdown-item" href="running-on-yarn.html">YARN</a> |
| <a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a> |
| <div class="dropdown-menu" aria-labelledby="navbarMore"> |
| <a class="dropdown-item" href="configuration.html">Configuration</a> |
| <a class="dropdown-item" href="monitoring.html">Monitoring</a> |
| <a class="dropdown-item" href="tuning.html">Tuning Guide</a> |
| <a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a> |
| <a class="dropdown-item" href="security.html">Security</a> |
| <a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a> |
| <a class="dropdown-item" href="migration-guide.html">Migration Guide</a> |
| <div class="dropdown-divider"></div> |
| <a class="dropdown-item" href="building-spark.html">Building Spark</a> |
| <a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a> |
| <a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a> |
| </div> |
| </li> |
| |
| <li class="nav-item"> |
| <input type="text" id="docsearch-input" placeholder="Search the docs…"> |
| </li> |
| </ul> |
| <!--<span class="navbar-text navbar-right"><span class="version-text">v4.0.1</span></span>--> |
| </div> |
| </nav> |
| |
| |
| |
| <div class="container"> |
| |
| <div class="content mr-3" id="content"> |
| |
| |
| <h1 class="title">Spark Connect Overview</h1> |
| |
| |
| <p><strong>Building client-side Spark applications</strong></p> |
| |
| <p>In Apache Spark 3.4, Spark Connect introduced a decoupled client-server |
| architecture that allows remote connectivity to Spark clusters using the |
| DataFrame API and unresolved logical plans as the protocol. The separation |
| between client and server allows Spark and its open ecosystem to be |
| leveraged from everywhere. It can be embedded in modern data applications, |
| in IDEs, notebooks, and programming languages.</p> |
| |
| <p>To get started, see <a href="api/python/getting_started/quickstart_connect.html">Quickstart: Spark Connect</a>.</p> |
| |
| <p style="text-align: center;"> |
| <img src="img/spark-connect-api.png" title="Spark Connect API" alt="Spark Connect API Diagram" /> |
| </p> |
| |
| <h1 id="how-spark-connect-works">How Spark Connect works</h1> |
| |
| <p>The Spark Connect client library is designed to simplify Spark application |
| development. It is a thin API that can be embedded everywhere: in application |
| servers, IDEs, notebooks, and programming languages. The Spark Connect API |
| builds on Spark’s DataFrame API using unresolved logical plans as a |
| language-agnostic protocol between the client and the Spark driver.</p> |
| |
| <p>The Spark Connect client translates DataFrame operations into unresolved |
| logical query plans which are encoded using protocol buffers. These are sent |
| to the server using the gRPC framework.</p> |
| |
| <p>The Spark Connect endpoint embedded on the Spark Server receives and |
| translates unresolved logical plans into Spark’s logical plan operators. |
| This is similar to parsing a SQL query, where attributes and relations are |
| parsed and an initial parse plan is built. From there, the standard Spark |
| execution process kicks in, ensuring that Spark Connect leverages all of |
| Spark’s optimizations and enhancements. Results are streamed back to the |
| client through gRPC as Apache Arrow-encoded row batches.</p> |
| |
| <p style="text-align: center;"> |
| <img src="img/spark-connect-communication.png" title="Spark Connect communication" alt="Spark Connect communication" /> |
| </p> |
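| |
| <p>As a rough illustration of this flow, the following minimal PySpark sketch (assuming a Spark Connect |
| server is already running on <code class="language-plaintext highlighter-rouge">localhost</code>, as described later on this page) builds a query lazily on the |
| client and only sends it to the server when an action is invoked:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python">from pyspark.sql import SparkSession |
| from pyspark.sql.functions import col |
| |
| # Connect to a Spark Connect server; "sc://localhost" assumes the local |
| # server started in the "How to use Spark Connect" section below. |
| spark = SparkSession.builder.remote("sc://localhost").getOrCreate() |
| |
| # Transformations only build an unresolved logical plan on the client side; |
| # nothing is executed yet. |
| df = spark.range(1000).withColumn("squared", col("id") * col("id")) |
| |
| # An action serializes the plan as protocol buffers, sends it to the server |
| # over gRPC, and streams the results back as Arrow-encoded batches. |
| df.filter(col("id") &lt; 5).show()</code></pre></figure> |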
| |
| <h2 id="how-spark-connect-client-applications-differ-from-classic-spark-applications">How Spark Connect client applications differ from classic Spark applications</h2> |
| |
| <p>One of the main design goals of Spark Connect is to enable a full separation and |
| isolation of the client from the server. As a consequence, there are some changes |
| that developers need to be aware of when using Spark Connect:</p> |
| |
| <ol> |
| <li>The client does not run in the same process as the Spark driver. This means that |
| the client cannot directly access and interact with the driver JVM to manipulate |
| the execution environment. In particular, in PySpark, the client does not use Py4J |
| and thus accessing the private fields holding the JVM implementation of <code class="language-plaintext highlighter-rouge">DataFrame</code>, |
| <code class="language-plaintext highlighter-rouge">Column</code>, <code class="language-plaintext highlighter-rouge">SparkSession</code>, etc. is not possible (e.g. <code class="language-plaintext highlighter-rouge">df._jdf</code>); see the sketch after this list.</li> |
| <li>By design, the Spark Connect protocol uses Spark’s logical |
| plans as the abstraction to be able to declaratively describe the operations to be executed |
| on the server. Consequently, the Spark Connect protocol does not support all the |
| execution APIs of Spark, most importantly RDDs.</li> |
| <li>Spark Connect provides a session-based client for its consumers. This means that the |
| client does not have access to properties of the cluster that manipulate the |
| environment for all connected clients. Most importantly, the client does not have access |
| to the static Spark configuration or the SparkContext.</li> |
| </ol> |
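| |
| <p>The following sketch (a minimal illustration, not an exhaustive list, assuming a Spark Connect session |
| against a local server) shows these differences from the PySpark side; the classic driver-side escape hatches |
| are simply not available:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python">from pyspark.sql import SparkSession |
| |
| # A Spark Connect session (assumes a local Connect server is running). |
| spark = SparkSession.builder.remote("sc://localhost").getOrCreate() |
| df = spark.range(5) |
| |
| # 1. There is no Py4J bridge to the driver JVM, so driver-side internals |
| #    such as df._jdf do not exist on Connect DataFrames. |
| # 2. The RDD API is not part of the Connect protocol; only declarative |
| #    DataFrame/SQL plans are shipped to the server, so df.rdd fails. |
| # 3. The client is session-scoped: the SparkContext and static cluster |
| #    configuration are not reachable from the client. |
| try: |
|     spark.sparkContext  # not supported over Spark Connect |
| except Exception as e: |
|     print("SparkContext is unavailable:", e)</code></pre></figure> |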
| |
| <h1 id="operational-benefits-of-spark-connect">Operational benefits of Spark Connect</h1> |
| |
| <p>With this new architecture, Spark Connect mitigates several multi-tenant |
| operational issues:</p> |
| |
| <p><strong>Stability</strong>: Applications that use too much memory will now only impact their |
| own environment as they can run in their own processes. Users can define their |
| own dependencies on the client and don’t need to worry about potential conflicts |
| with the Spark driver.</p> |
| |
| <p><strong>Upgradability</strong>: The Spark driver can now seamlessly be upgraded independently |
| of applications, for example to benefit from performance improvements and security fixes. |
| This means applications can be forward-compatible, as long as the server-side RPC |
| definitions are designed to be backwards compatible.</p> |
| |
| <p><strong>Debuggability and observability</strong>: Spark Connect enables interactive debugging |
| during development directly from your favorite IDE. Similarly, applications can |
| be monitored using the application framework’s native metrics and logging libraries.</p> |
| |
| <h1 id="how-to-use-spark-connect">How to use Spark Connect</h1> |
| |
| <p>Spark Connect is available for PySpark and Scala |
| applications. We will walk through how to run an Apache Spark server with Spark |
| Connect and connect to it from a client application using the Spark Connect client |
| library.</p> |
| |
| <h2 id="download-and-start-spark-server-with-spark-connect">Download and start Spark server with Spark Connect</h2> |
| |
| <p>First, download Spark from the |
| <a href="https://spark.apache.org/downloads.html">Download Apache Spark</a> page. Choose the |
| latest release in the release drop down at the top of the page. Then choose your package type, typically |
| “Pre-built for Apache Hadoop 3.3 and later”, and click the link to download.</p> |
| |
| <p>Now extract the Spark package you just downloaded on your computer, for example:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nb">tar</span> <span class="nt">-xvf</span> spark-4.0.1-bin-hadoop3.tgz</code></pre></figure> |
| |
| <p>In a terminal window, go to the <code class="language-plaintext highlighter-rouge">spark</code> folder in the location where you extracted |
| Spark earlier, and run the <code class="language-plaintext highlighter-rouge">start-connect-server.sh</code> script to start the Spark server with |
| Spark Connect, as in this example:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash">./sbin/start-connect-server.sh</code></pre></figure> |
| |
| <p>Make sure to use the same version of the package as the Spark version you |
| downloaded previously; in this example, that is Spark 4.0.1 with Scala 2.13.</p> |
| |
| <p>The Spark server is now running and ready to accept Spark Connect sessions from client |
| applications. In the next section we will walk through how to use Spark Connect |
| when writing client applications.</p> |
| |
| <h2 id="use-spark-connect-for-interactive-analysis">Use Spark Connect for interactive analysis</h2> |
| <div class="codetabs"> |
| |
| <div data-lang="python"> |
| <p>When creating a Spark session, you can specify that you want to use Spark Connect, |
| and there are a few ways to do that, as outlined below.</p> |
| |
| <p>If you do not use one of the mechanisms outlined here, your Spark session will |
| work just like before, without leveraging Spark Connect.</p> |
| |
| <h3 id="set-sparkremote-environment-variable">Set SPARK_REMOTE environment variable</h3> |
| |
| <p>If you set the <code class="language-plaintext highlighter-rouge">SPARK_REMOTE</code> environment variable on the client machine where your |
| Spark client application is running and create a new Spark Session as in the following |
| example, the session will be a Spark Connect session. With this approach, there is no |
| code change needed to start using Spark Connect.</p> |
| |
| <p>In a terminal window, set the <code class="language-plaintext highlighter-rouge">SPARK_REMOTE</code> environment variable to point to the |
| local Spark server you started previously on your computer:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nb">export </span><span class="nv">SPARK_REMOTE</span><span class="o">=</span><span class="s2">"sc://localhost"</span></code></pre></figure> |
| |
| <p>And start the Spark shell as usual:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash">./bin/pyspark</code></pre></figure> |
| |
| <p>The PySpark shell is now connected to Spark using Spark Connect as indicated in the welcome message:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">Client</span> <span class="n">connected</span> <span class="n">to</span> <span class="n">the</span> <span class="n">Spark</span> <span class="n">Connect</span> <span class="n">server</span> <span class="n">at</span> <span class="n">localhost</span></code></pre></figure> |
| |
| <h3 id="specify-spark-connect-when-creating-spark-session">Specify Spark Connect when creating Spark session</h3> |
| |
| <p>You can also specify that you want to use Spark Connect explicitly when you |
| create a Spark session.</p> |
| |
| <p>For example, you can launch the PySpark shell with Spark Connect as |
| illustrated here.</p> |
| |
| <p>To launch the PySpark shell with Spark Connect, simply include the <code class="language-plaintext highlighter-rouge">remote</code> |
| parameter and specify the location of your Spark server. We are using <code class="language-plaintext highlighter-rouge">localhost</code> |
| in this example to connect to the local Spark server we started previously:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash">./bin/pyspark <span class="nt">--remote</span> <span class="s2">"sc://localhost"</span></code></pre></figure> |
| |
| <p>And you will notice that the PySpark shell welcome message tells you that |
| you have connected to Spark using Spark Connect:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">Client</span> <span class="n">connected</span> <span class="n">to</span> <span class="n">the</span> <span class="n">Spark</span> <span class="n">Connect</span> <span class="n">server</span> <span class="n">at</span> <span class="n">localhost</span></code></pre></figure> |
| |
| <p>You can also check the Spark session type. If it includes <code class="language-plaintext highlighter-rouge">.connect.</code>, you |
| are using Spark Connect, as shown in this example:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">SparkSession</span> <span class="n">available</span> <span class="k">as</span> <span class="sh">'</span><span class="s">spark</span><span class="sh">'</span><span class="p">.</span> |
| <span class="o">>>></span> <span class="nf">type</span><span class="p">(</span><span class="n">spark</span><span class="p">)</span> |
| <span class="o"><</span><span class="k">class</span> <span class="err">'</span><span class="nc">pyspark</span><span class="p">.</span><span class="n">sql</span><span class="p">.</span><span class="n">connect</span><span class="p">.</span><span class="n">session</span><span class="p">.</span><span class="n">SparkSession</span><span class="sh">'</span><span class="s">></span></code></pre></figure> |
| |
| <p>Now you can run PySpark code in the shell to see Spark Connect in action:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="o">>>></span> <span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="sh">"</span><span class="s">id</span><span class="sh">"</span><span class="p">,</span> <span class="sh">"</span><span class="s">name</span><span class="sh">"</span><span class="p">]</span> |
| <span class="o">>>></span> <span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="mi">1</span><span class="p">,</span><span class="sh">"</span><span class="s">Sarah</span><span class="sh">"</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span><span class="sh">"</span><span class="s">Maria</span><span class="sh">"</span><span class="p">)]</span> |
| <span class="o">>>></span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="nf">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">).</span><span class="nf">toDF</span><span class="p">(</span><span class="o">*</span><span class="n">columns</span><span class="p">)</span> |
| <span class="o">>>></span> <span class="n">df</span><span class="p">.</span><span class="nf">show</span><span class="p">()</span> |
| <span class="o">+---+-----+</span> |
| <span class="o">|</span> <span class="nb">id</span><span class="o">|</span> <span class="n">name</span><span class="o">|</span> |
| <span class="o">+---+-----+</span> |
| <span class="o">|</span> <span class="mi">1</span><span class="o">|</span><span class="n">Sarah</span><span class="o">|</span> |
| <span class="o">|</span> <span class="mi">2</span><span class="o">|</span><span class="n">Maria</span><span class="o">|</span> |
| <span class="o">+---+-----+</span></code></pre></figure> |
| |
| </div> |
| |
| <div data-lang="scala"> |
| <p>For the Scala shell, we use an Ammonite-based REPL. Otherwise, it is very similar to the PySpark shell.</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash">./bin/spark-shell <span class="nt">--remote</span> <span class="s2">"sc://localhost"</span></code></pre></figure> |
| |
| <p>A greeting message will appear when the REPL successfully initializes:</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash">Welcome to |
| ____ __ |
| / __/__ ___ _____/ /__ |
| _<span class="se">\ \/</span> _ <span class="se">\/</span> _ <span class="sb">`</span>/ __/ <span class="s1">'_/ |
| /___/ .__/\_,_/_/ /_/\_\ version 4.0.1 |
| /_/ |
| |
| Type in expressions to have them evaluated. |
| Spark session available as '</span>spark<span class="s1">'.</span></code></pre></figure> |
| |
| <p>By default, the REPL will attempt to connect to a local Spark Server. |
| Run the following Scala code in the shell to see Spark Connect in action:</p> |
| |
| <figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">@</span> <span class="nv">spark</span><span class="o">.</span><span class="py">range</span><span class="o">(</span><span class="mi">10</span><span class="o">).</span><span class="py">count</span> |
| <span class="n">res0</span><span class="k">:</span> <span class="kt">Long</span> <span class="o">=</span> <span class="mi">10L</span></code></pre></figure> |
| |
| <h3 id="configure-client-server-connection">Configure client-server connection</h3> |
| |
| <p>By default, the REPL will attempt to connect to a local Spark Server on port 15002. |
| The connection, however, may be configured in several ways as described in this configuration |
| <a href="https://github.com/apache/spark/blob/master/sql/connect/docs/client-connection-string.md">reference</a>.</p> |
| |
| <h4 id="set-sparkremote-environment-variable-1">Set SPARK_REMOTE environment variable</h4> |
| |
| <p>The SPARK_REMOTE environment variable can be set on the client machine to customize the client-server |
| connection that is initialized at REPL startup.</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nb">export </span><span class="nv">SPARK_REMOTE</span><span class="o">=</span><span class="s2">"sc://myhost.com:443/;token=ABCDEFG"</span> |
| ./bin/spark-shell</code></pre></figure> |
| |
| <p>or</p> |
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nv">SPARK_REMOTE</span><span class="o">=</span><span class="s2">"sc://myhost.com:443/;token=ABCDEFG"</span> spark-connect-repl</code></pre></figure> |
| |
| <h4 id="configure-programmatically-with-a-connection-string">Configure programmatically with a connection string</h4> |
| |
| <p>The connection may also be programmatically created using <em>SparkSession#builder</em> as in this example:</p> |
| |
| <figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">@</span> <span class="k">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span> |
| <span class="k">@</span> <span class="k">val</span> <span class="nv">spark</span> <span class="k">=</span> <span class="nv">SparkSession</span><span class="o">.</span><span class="py">builder</span><span class="o">.</span><span class="py">remote</span><span class="o">(</span><span class="s">"sc://localhost:443/;token=ABCDEFG"</span><span class="o">).</span><span class="py">getOrCreate</span><span class="o">()</span></code></pre></figure> |
| |
| </div> |
| </div> |
| |
| <h2 id="use-spark-connect-in-standalone-applications">Use Spark Connect in standalone applications</h2> |
| |
| <div class="codetabs"> |
| |
| |
| <div data-lang="python"> |
| |
| <p>First, install PySpark with <code class="language-plaintext highlighter-rouge">pip install pyspark[connect]==4.0.1</code>, or, if building a packaged PySpark application/library, |
| add it to your setup.py file as:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">install_requires</span><span class="o">=</span><span class="p">[</span> |
| <span class="sh">'</span><span class="s">pyspark[connect]==4.0.1</span><span class="sh">'</span> |
| <span class="p">]</span></code></pre></figure> |
| |
| <p>When writing your own code, include the <code class="language-plaintext highlighter-rouge">remote</code> function with a reference to |
| your Spark server when you create a Spark session, as in this example:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="n">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="p">.</span><span class="n">builder</span><span class="p">.</span><span class="nf">remote</span><span class="p">(</span><span class="sh">"</span><span class="s">sc://localhost</span><span class="sh">"</span><span class="p">).</span><span class="nf">getOrCreate</span><span class="p">()</span></code></pre></figure> |
| |
| <p>For illustration purposes, we’ll create a simple Spark Connect application, SimpleApp.py:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="sh">"""</span><span class="s">SimpleApp.py</span><span class="sh">"""</span> |
| <span class="kn">from</span> <span class="n">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| |
| <span class="n">logFile</span> <span class="o">=</span> <span class="sh">"</span><span class="s">YOUR_SPARK_HOME/README.md</span><span class="sh">"</span> <span class="c1"># Should be some file on your system |
| </span><span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="p">.</span><span class="n">builder</span><span class="p">.</span><span class="nf">remote</span><span class="p">(</span><span class="sh">"</span><span class="s">sc://localhost</span><span class="sh">"</span><span class="p">).</span><span class="nf">appName</span><span class="p">(</span><span class="sh">"</span><span class="s">SimpleApp</span><span class="sh">"</span><span class="p">).</span><span class="nf">getOrCreate</span><span class="p">()</span> |
| <span class="n">logData</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="nf">text</span><span class="p">(</span><span class="n">logFile</span><span class="p">).</span><span class="nf">cache</span><span class="p">()</span> |
| |
| <span class="n">numAs</span> <span class="o">=</span> <span class="n">logData</span><span class="p">.</span><span class="nf">filter</span><span class="p">(</span><span class="n">logData</span><span class="p">.</span><span class="n">value</span><span class="p">.</span><span class="nf">contains</span><span class="p">(</span><span class="sh">'</span><span class="s">a</span><span class="sh">'</span><span class="p">)).</span><span class="nf">count</span><span class="p">()</span> |
| <span class="n">numBs</span> <span class="o">=</span> <span class="n">logData</span><span class="p">.</span><span class="nf">filter</span><span class="p">(</span><span class="n">logData</span><span class="p">.</span><span class="n">value</span><span class="p">.</span><span class="nf">contains</span><span class="p">(</span><span class="sh">'</span><span class="s">b</span><span class="sh">'</span><span class="p">)).</span><span class="nf">count</span><span class="p">()</span> |
| |
| <span class="nf">print</span><span class="p">(</span><span class="sh">"</span><span class="s">Lines with a: %i, lines with b: %i</span><span class="sh">"</span> <span class="o">%</span> <span class="p">(</span><span class="n">numAs</span><span class="p">,</span> <span class="n">numBs</span><span class="p">))</span> |
| |
| <span class="n">spark</span><span class="p">.</span><span class="nf">stop</span><span class="p">()</span></code></pre></figure> |
| |
| <p>This program just counts the number of lines containing ‘a’ and the number containing ‘b’ in a text file. |
| Note that you’ll need to replace YOUR_SPARK_HOME with the location where Spark is installed.</p> |
| |
| <p>We can run this application with the regular Python interpreter as follows:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="c1"># Use the Python interpreter to run your application |
| </span><span class="err">$</span> <span class="n">python</span> <span class="n">SimpleApp</span><span class="p">.</span><span class="n">py</span> |
| <span class="bp">...</span> |
| <span class="n">Lines</span> <span class="k">with</span> <span class="n">a</span><span class="p">:</span> <span class="mi">72</span><span class="p">,</span> <span class="n">lines</span> <span class="k">with</span> <span class="n">b</span><span class="p">:</span> <span class="mi">39</span></code></pre></figure> |
| |
| </div> |
| |
| |
| <div data-lang="scala"> |
| <p>To use Spark Connect as part of a Scala application/project, we first need to include the right dependencies. |
| Using the <code class="language-plaintext highlighter-rouge">sbt</code> build system as an example, we add the following dependencies to the <code class="language-plaintext highlighter-rouge">build.sbt</code> file:</p> |
| |
| <figure class="highlight"><pre><code class="language-sbt" data-lang="sbt">libraryDependencies += "org.apache.spark" %% "spark-connect-client-jvm" % "4.0.1"</code></pre></figure> |
| |
| <p>When writing your own code, include the <code class="language-plaintext highlighter-rouge">remote</code> function with a reference to |
| your Spark server when you create a Spark session, as in this example:</p> |
| |
| <figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span> |
| <span class="k">val</span> <span class="nv">spark</span> <span class="k">=</span> <span class="nv">SparkSession</span><span class="o">.</span><span class="py">builder</span><span class="o">().</span><span class="py">remote</span><span class="o">(</span><span class="s">"sc://localhost"</span><span class="o">).</span><span class="py">getOrCreate</span><span class="o">()</span></code></pre></figure> |
| |
| <p><strong>Note</strong>: Operations that reference user-defined code such as UDFs, <code class="language-plaintext highlighter-rouge">filter</code>, <code class="language-plaintext highlighter-rouge">map</code>, etc. require a |
| <a href="https://github.com/apache/spark/blob/master/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/ClassFinder.scala">ClassFinder</a> |
| to be registered in order to pick up and upload any required classfiles. Also, any JAR dependencies must be uploaded to the server using <code class="language-plaintext highlighter-rouge">SparkSession#addArtifact</code>.</p> |
| |
| <p>Example:</p> |
| |
| <figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.sql.connect.client.REPLClassDirMonitor</span> |
| <span class="c1">// Register a ClassFinder to monitor and upload the classfiles from the build output.</span> |
| <span class="k">val</span> <span class="nv">classFinder</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">REPLClassDirMonitor</span><span class="o">(<</span><span class="nc">ABSOLUTE_PATH_TO_BUILD_OUTPUT_DIR</span><span class="o">>)</span> |
| <span class="nv">spark</span><span class="o">.</span><span class="py">registerClassFinder</span><span class="o">(</span><span class="n">classFinder</span><span class="o">)</span> |
| |
| <span class="c1">// Upload JAR dependencies</span> |
| <span class="nv">spark</span><span class="o">.</span><span class="py">addArtifact</span><span class="o">(<</span><span class="nc">ABSOLUTE_PATH_JAR_DEP</span><span class="o">>)</span></code></pre></figure> |
| |
| <p>Here, <code class="language-plaintext highlighter-rouge">ABSOLUTE_PATH_TO_BUILD_OUTPUT_DIR</code> is the output directory where the build system writes classfiles into |
| and <code class="language-plaintext highlighter-rouge">ABSOLUTE_PATH_JAR_DEP</code> is the location of the JAR on the local file system.</p> |
| |
| <p>The <code class="language-plaintext highlighter-rouge">REPLClassDirMonitor</code> is a provided implementation of <code class="language-plaintext highlighter-rouge">ClassFinder</code> that monitors a specific directory, but |
| you may implement your own class extending <code class="language-plaintext highlighter-rouge">ClassFinder</code> for customized search and monitoring.</p> |
| |
| </div> |
| </div> |
| |
| <p>For more information on application development with Spark Connect as well as extending Spark Connect |
| with custom functionality, see <a href="app-dev-spark-connect.html">Application Development with Spark Connect</a>.</p> |
| <h1 id="client-application-authentication">Client application authentication</h1> |
| |
| <p>While Spark Connect does not have built-in authentication, it is designed to |
| work seamlessly with your existing authentication infrastructure. Its gRPC |
| HTTP/2 interface allows for the use of authenticating proxies, which makes |
| it possible to secure Spark Connect without having to implement authentication |
| logic in Spark directly.</p> |
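| |
| <p>For example, a common pattern (sketched below; <code class="language-plaintext highlighter-rouge">gateway.example.com</code> is a hypothetical proxy host) |
| is to place an authenticating gRPC proxy in front of the Spark Connect endpoint and to pass a bearer token |
| through the connection string; the proxy, not Spark, validates the token:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python">from pyspark.sql import SparkSession |
| |
| # The token and use_ssl options are part of the Spark Connect connection |
| # string; here they are consumed by an authenticating proxy sitting in |
| # front of the server rather than by Spark itself. |
| spark = ( |
|     SparkSession.builder |
|     .remote("sc://gateway.example.com:443/;use_ssl=true;token=MY_ACCESS_TOKEN") |
|     .getOrCreate() |
| )</code></pre></figure> |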
| |
| <h1 id="what-is-supported">What is supported</h1> |
| |
| <p><strong>PySpark</strong>: Since Spark 3.4, Spark Connect supports most PySpark APIs, including |
| <a href="api/python/reference/pyspark.sql/dataframe.html">DataFrame</a>, |
| <a href="api/python/reference/pyspark.sql/functions.html">Functions</a>, and |
| <a href="api/python/reference/pyspark.sql/column.html">Column</a>. However, |
| some APIs such as <a href="api/python/reference/api/pyspark.SparkContext.html">SparkContext</a> |
| and <a href="api/python/reference/api/pyspark.RDD.html">RDD</a> are not supported. |
| You can check which APIs are currently |
| supported in the <a href="api/python/reference/index.html">API reference</a> documentation. |
| Supported APIs are labeled “Supports Spark Connect” so you can check whether the |
| APIs you are using are available before migrating existing code to Spark Connect.</p> |
| |
| <p><strong>Scala</strong>: Since Spark 3.5, Spark Connect supports most Scala APIs, including |
| <a href="api/scala/org/apache/spark/sql/Dataset.html">Dataset</a>, |
| <a href="api/scala/org/apache/spark/sql/functions$.html">functions</a>, |
| <a href="api/scala/org/apache/spark/sql/Column.html">Column</a>, |
| <a href="api/scala/org/apache/spark/sql/catalog/Catalog.html">Catalog</a> and |
| <a href="api/scala/org/apache/spark/sql/KeyValueGroupedDataset.html">KeyValueGroupedDataset</a>.</p> |
| |
| <p>User-Defined Functions (UDFs) are supported: by default in the shell, and in standalone applications with |
| additional set-up requirements.</p> |
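| |
| <p>For instance, a UDF defined on the client is serialized and executed on the server. A minimal sketch |
| (shown in PySpark for brevity; it assumes a local Connect server) looks as follows:</p> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python">from pyspark.sql import SparkSession |
| from pyspark.sql.functions import udf |
| from pyspark.sql.types import StringType |
| |
| spark = SparkSession.builder.remote("sc://localhost").getOrCreate() |
| |
| # The UDF body is pickled on the client and shipped to the server, |
| # where it runs against each row. |
| @udf(returnType=StringType()) |
| def shout(s): |
|     return s.upper() + "!" |
| |
| df = spark.createDataFrame([("hello",), ("spark connect",)], ["text"]) |
| df.select(shout("text")).show()</code></pre></figure> |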
| |
| <p>The majority of the Streaming API is supported, including |
| <a href="api/scala/org/apache/spark/sql/streaming/DataStreamReader.html">DataStreamReader</a>, |
| <a href="api/scala/org/apache/spark/sql/streaming/DataStreamWriter.htmll">DataStreamWriter</a>, |
| <a href="api/scala/org/apache/spark/sql/streaming/StreamingQuery.html">StreamingQuery</a> and |
| <a href="api/scala/org/apache/spark/sql/streaming/StreamingQueryListener.html">StreamingQueryListener</a>.</p> |
| |
| <p>APIs such as <a href="api/scala/org/apache/spark/SparkContext.html">SparkContext</a> |
| and <a href="api/scala/org/apache/spark/rdd/RDD.html">RDD</a> are unsupported in Spark Connect.</p> |
| |
| <p>Support for more APIs is planned for upcoming Spark releases.</p> |
| |
| </div> |
| |
| <!-- /container --> |
| </div> |
| |
| <script src="js/vendor/jquery-3.5.1.min.js"></script> |
| <script src="js/vendor/bootstrap.bundle.min.js"></script> |
| |
| <script src="js/vendor/anchor.min.js"></script> |
| <script src="js/main.js"></script> |
| |
| <script type="text/javascript" src="js/vendor/docsearch.min.js"></script> |
| <script type="text/javascript"> |
| // DocSearch is entirely free and automated. DocSearch is built in two parts: |
| // 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link |
| // in your website and extract content from every page it traverses. It then pushes this |
| // content to an Algolia index. |
| // 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index |
| // to your search input and display its results in a dropdown UI. If you want to find more |
| // details on how DocSearch works, check the DocSearch docs. |
| docsearch({ |
| apiKey: 'd62f962a82bc9abb53471cb7b89da35e', |
| appId: 'RAI69RXRSK', |
| indexName: 'apache_spark', |
| inputSelector: '#docsearch-input', |
| enhancedSearchInput: true, |
| algoliaOptions: { |
| 'facetFilters': ["version:4.0.1"] |
| }, |
| debug: false // Set debug to true if you want to inspect the dropdown |
| }); |
| |
| </script> |
| |
| <!-- MathJax Section --> |
| <script type="text/x-mathjax-config"> |
| MathJax.Hub.Config({ |
| TeX: { equationNumbers: { autoNumber: "AMS" } } |
| }); |
| </script> |
| <script> |
| // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. |
| // We could use "//cdn.mathjax...", but that won't support "file://". |
| (function(d, script) { |
| script = d.createElement('script'); |
| script.type = 'text/javascript'; |
| script.async = true; |
| script.onload = function(){ |
| MathJax.Hub.Config({ |
| tex2jax: { |
| inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], |
| displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], |
| processEscapes: true, |
| skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] |
| } |
| }); |
| }; |
| script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + |
| 'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' + |
| '?config=TeX-AMS-MML_HTMLorMML'; |
| d.getElementsByTagName('head')[0].appendChild(script); |
| }(document)); |
| </script> |
| </body> |
| </html> |