<!DOCTYPE html>
<html class="no-js">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Spark Connect Overview - Spark 4.0.1 Documentation</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&Courier+Prime:wght@400;700&display=swap" rel="stylesheet">
<link href="css/custom.css" rel="stylesheet">
<script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
<link rel="stylesheet" href="css/pygments-default.css">
<link rel="stylesheet" href="css/docsearch.min.css" />
<link rel="stylesheet" href="css/docsearch.css">
<!-- Matomo -->
<script>
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="global">
<!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
<nav class="navbar navbar-expand-lg navbar-dark p-0 px-4 fixed-top" style="background: #1d6890;" id="topbar">
<div class="navbar-brand"><a href="index.html">
<img src="https://spark.apache.org/images/spark-logo-rev.svg" width="141" height="72"/></a><span class="version">4.0.1</span>
</div>
<button class="navbar-toggler" type="button" data-toggle="collapse"
data-target="#navbarCollapse" aria-controls="navbarCollapse"
aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarCollapse">
<ul class="navbar-nav me-auto">
<li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a>
<div class="dropdown-menu" aria-labelledby="navbarQuickStart">
<a class="dropdown-item" href="quick-start.html">Quick Start</a>
<a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a>
<a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a>
<a class="dropdown-item" href="streaming/index.html">Structured Streaming</a>
<a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a>
<a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a>
<a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a>
<a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a>
<a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a>
<div class="dropdown-menu" aria-labelledby="navbarAPIDocs">
<a class="dropdown-item" href="api/python/index.html">Python</a>
<a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a>
<a class="dropdown-item" href="api/java/index.html">Java</a>
<a class="dropdown-item" href="api/R/index.html">R</a>
<a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a>
<div class="dropdown-menu" aria-labelledby="navbarDeploying">
<a class="dropdown-item" href="cluster-overview.html">Overview</a>
<a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a>
<a class="dropdown-item" href="running-on-yarn.html">YARN</a>
<a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
<div class="dropdown-menu" aria-labelledby="navbarMore">
<a class="dropdown-item" href="configuration.html">Configuration</a>
<a class="dropdown-item" href="monitoring.html">Monitoring</a>
<a class="dropdown-item" href="tuning.html">Tuning Guide</a>
<a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a>
<a class="dropdown-item" href="security.html">Security</a>
<a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a>
<a class="dropdown-item" href="migration-guide.html">Migration Guide</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="building-spark.html">Building Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a>
</div>
</li>
<li class="nav-item">
<input type="text" id="docsearch-input" placeholder="Search the docs…">
</li>
</ul>
<!--<span class="navbar-text navbar-right"><span class="version-text">v4.0.1</span></span>-->
</div>
</nav>
<div class="container">
<div class="content mr-3" id="content">
<h1 class="title">Spark Connect Overview</h1>
<p><strong>Building client-side Spark applications</strong></p>
<p>In Apache Spark 3.4, Spark Connect introduced a decoupled client-server
architecture that allows remote connectivity to Spark clusters using the
DataFrame API and unresolved logical plans as the protocol. The separation
between client and server allows Spark and its open ecosystem to be
leveraged from everywhere. It can be embedded in modern data applications,
in IDEs, notebooks, and programming languages.</p>
<p>To get started, see <a href="api/python/getting_started/quickstart_connect.html">Quickstart: Spark Connect</a>.</p>
<p style="text-align: center;">
<img src="img/spark-connect-api.png" title="Spark Connect API" alt="Spark Connect API Diagram" />
</p>
<h1 id="how-spark-connect-works">How Spark Connect works</h1>
<p>The Spark Connect client library is designed to simplify Spark application
development. It is a thin API that can be embedded everywhere: in application
servers, IDEs, notebooks, and programming languages. The Spark Connect API
builds on Spark&#8217;s DataFrame API using unresolved logical plans as a
language-agnostic protocol between the client and the Spark driver.</p>
<p>The Spark Connect client translates DataFrame operations into unresolved
logical query plans which are encoded using protocol buffers. These are sent
to the server using the gRPC framework.</p>
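<p>As a minimal sketch of this flow (assuming a Spark Connect session named <code class="language-plaintext highlighter-rouge">spark</code> already exists), DataFrame transformations only accumulate an unresolved plan on the client; nothing is sent to the server until an action is invoked:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"># Hedged sketch: transformations are recorded client-side as an unresolved logical plan.
df = spark.range(100).filter("id % 2 = 0").selectExpr("count(*) AS even_count")

# Only an action serializes that plan to protocol buffers and sends it over gRPC.
df.show()</code></pre></figure>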
<p>The Spark Connect endpoint embedded on the Spark Server receives and
translates unresolved logical plans into Spark&#8217;s logical plan operators.
This is similar to parsing a SQL query, where attributes and relations are
parsed and an initial parse plan is built. From there, the standard Spark
execution process kicks in, ensuring that Spark Connect leverages all of
Spark&#8217;s optimizations and enhancements. Results are streamed back to the
client through gRPC as Apache Arrow-encoded row batches.</p>
<p style="text-align: center;">
<img src="img/spark-connect-communication.png" title="Spark Connect communication" alt="Spark Connect communication" />
</p>
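<p>Because results come back as Arrow record batches, converting them into client-side structures is cheap. A small sketch, again assuming an existing Connect session <code class="language-plaintext highlighter-rouge">spark</code> (pandas and PyArrow are installed together with <code class="language-plaintext highlighter-rouge">pyspark[connect]</code>):</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"># Hedged sketch: the action below streams Arrow-encoded batches back over gRPC;
# toPandas() assembles those batches into a local pandas DataFrame on the client.
pdf = spark.range(5).selectExpr("id", "id * id AS squared").toPandas()
print(pdf)</code></pre></figure>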
<h2 id="how-spark-connect-client-applications-differ-from-classic-spark-applications">How Spark Connect client applications differ from classic Spark applications</h2>
<p>One of the main design goals of Spark Connect is to enable a full separation and
isolation of the client from the server. As a consequence, there are some changes
that developers need to be aware of when using Spark Connect:</p>
<ol>
<li>The client does not run in the same process as the Spark driver. This means that
the client cannot directly access and interact with the driver JVM to manipulate
the execution environment. In particular, in PySpark, the client does not use Py4J,
and thus accessing the private fields holding the JVM implementation of <code class="language-plaintext highlighter-rouge">DataFrame</code>,
<code class="language-plaintext highlighter-rouge">Column</code>, <code class="language-plaintext highlighter-rouge">SparkSession</code>, etc. is not possible (e.g. <code class="language-plaintext highlighter-rouge">df._jdf</code>); see the sketch following this list.</li>
<li>By design, the Spark Connect protocol uses Spark&#8217;s logical
plans as the abstraction to be able to declaratively describe the operations to be executed
on the server. Consequently, the Spark Connect protocol does not support all the
execution APIs of Spark, most importantly RDDs.</li>
<li>Spark Connect provides a session-based client for its consumers. This means that the
client does not have access to properties of the cluster that manipulate the
environment for all connected clients. Most importantly, the client does not have access
to the static Spark configuration or the SparkContext.</li>
</ol>
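<p>A minimal PySpark sketch of these differences, assuming a Spark Connect server is reachable at <code class="language-plaintext highlighter-rouge">localhost</code>:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python">from pyspark.sql import SparkSession

# The client talks to a remote Connect endpoint instead of embedding a JVM driver.
spark = SparkSession.builder.remote("sc://localhost").getOrCreate()

df = spark.range(3)
print(type(df))   # a pyspark.sql.connect DataFrame, not a Py4J-backed one

# There is no Py4J bridge, so JVM internals such as df._jdf are not accessible,
# and cluster-wide handles such as the SparkContext are not exposed to the client.</code></pre></figure>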
<h1 id="operational-benefits-of-spark-connect">Operational benefits of Spark Connect</h1>
<p>With this new architecture, Spark Connect mitigates several multi-tenant
operational issues:</p>
<p><strong>Stability</strong>: Applications that use too much memory will now only impact their
own environment as they can run in their own processes. Users can define their
own dependencies on the client and don&#8217;t need to worry about potential conflicts
with the Spark driver.</p>
<p><strong>Upgradability</strong>: The Spark driver can now seamlessly be upgraded independently
of applications, for example to benefit from performance improvements and security fixes.
This means applications can be forward-compatible, as long as the server-side RPC
definitions are designed to be backwards compatible.</p>
<p><strong>Debuggability and observability</strong>: Spark Connect enables interactive debugging
during development directly from your favorite IDE. Similarly, applications can
be monitored using the application framework&#8217;s native metrics and logging libraries.</p>
<h1 id="how-to-use-spark-connect">How to use Spark Connect</h1>
<p>Spark Connect supports PySpark and Scala
applications. We will walk through how to run an Apache Spark server with Spark
Connect and connect to it from a client application using the Spark Connect client
library.</p>
<h2 id="download-and-start-spark-server-with-spark-connect">Download and start Spark server with Spark Connect</h2>
<p>First, download Spark from the
<a href="https://spark.apache.org/downloads.html">Download Apache Spark</a> page. Choose the
latest release in the release drop down at the top of the page. Then choose your package type, typically
“Pre-built for Apache Hadoop 3.3 and later”, and click the link to download.</p>
<p>Now extract the Spark package you just downloaded on your computer, for example:</p>
<figure class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nb">tar</span> <span class="nt">-xvf</span> spark-4.0.1-bin-hadoop3.tgz</code></pre></figure>
<p>In a terminal window, go to the <code class="language-plaintext highlighter-rouge">spark</code> folder in the location where you extracted
Spark before, and run the <code class="language-plaintext highlighter-rouge">start-connect-server.sh</code> script to start the Spark server with
Spark Connect, as in this example:</p>
<figure class="highlight"><pre><code class="language-bash" data-lang="bash">./sbin/start-connect-server.sh</code></pre></figure>
<p>Make sure to use the same version of the package as the Spark version you
downloaded previously. In this example, Spark 4.0.1 with Scala 2.13.</p>
<p>Now the Spark server is running and ready to accept Spark Connect sessions from client
applications. In the next section, we will walk through how to use Spark Connect
when writing client applications.</p>
<h2 id="use-spark-connect-for-interactive-analysis">Use Spark Connect for interactive analysis</h2>
<div class="codetabs">
<div data-lang="python">
<p>When creating a Spark session, you can specify that you want to use Spark Connect.
There are a few ways to do that, as outlined below.</p>
<p>If you do not use one of the mechanisms outlined here, your Spark session will
work just like before, without leveraging Spark Connect.</p>
<h3 id="set-sparkremote-environment-variable">Set SPARK_REMOTE environment variable</h3>
<p>If you set the <code class="language-plaintext highlighter-rouge">SPARK_REMOTE</code> environment variable on the client machine where your
Spark client application is running and create a new Spark Session as in the following
example, the session will be a Spark Connect session. With this approach, there is no
code change needed to start using Spark Connect.</p>
<p>In a terminal window, set the <code class="language-plaintext highlighter-rouge">SPARK_REMOTE</code> environment variable to point to the
local Spark server you started previously on your computer:</p>
<figure class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nb">export </span><span class="nv">SPARK_REMOTE</span><span class="o">=</span><span class="s2">"sc://localhost"</span></code></pre></figure>
<p>And start the Spark shell as usual:</p>
<figure class="highlight"><pre><code class="language-bash" data-lang="bash">./bin/pyspark</code></pre></figure>
<p>The PySpark shell is now connected to Spark using Spark Connect as indicated in the welcome message:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">Client</span> <span class="n">connected</span> <span class="n">to</span> <span class="n">the</span> <span class="n">Spark</span> <span class="n">Connect</span> <span class="n">server</span> <span class="n">at</span> <span class="n">localhost</span></code></pre></figure>
<h3 id="specify-spark-connect-when-creating-spark-session">Specify Spark Connect when creating Spark session</h3>
<p>You can also specify that you want to use Spark Connect explicitly when you
create a Spark session.</p>
<p>For example, you can launch the PySpark shell with Spark Connect as
illustrated here.</p>
<p>To launch the PySpark shell with Spark Connect, simply include the <code class="language-plaintext highlighter-rouge">remote</code>
parameter and specify the location of your Spark server. We are using <code class="language-plaintext highlighter-rouge">localhost</code>
in this example to connect to the local Spark server we started previously:</p>
<figure class="highlight"><pre><code class="language-bash" data-lang="bash">./bin/pyspark <span class="nt">--remote</span> <span class="s2">"sc://localhost"</span></code></pre></figure>
<p>And you will notice that the PySpark shell welcome message tells you that
you have connected to Spark using Spark Connect:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">Client</span> <span class="n">connected</span> <span class="n">to</span> <span class="n">the</span> <span class="n">Spark</span> <span class="n">Connect</span> <span class="n">server</span> <span class="n">at</span> <span class="n">localhost</span></code></pre></figure>
<p>You can also check the Spark session type. If it includes <code class="language-plaintext highlighter-rouge">.connect.</code>, you
are using Spark Connect, as shown in this example:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">SparkSession</span> <span class="n">available</span> <span class="k">as</span> <span class="sh">'</span><span class="s">spark</span><span class="sh">'</span><span class="p">.</span>
<span class="o">&gt;&gt;&gt;</span> <span class="nf">type</span><span class="p">(</span><span class="n">spark</span><span class="p">)</span>
<span class="o">&lt;</span><span class="k">class</span> <span class="err">'</span><span class="nc">pyspark</span><span class="p">.</span><span class="n">sql</span><span class="p">.</span><span class="n">connect</span><span class="p">.</span><span class="n">session</span><span class="p">.</span><span class="n">SparkSession</span><span class="sh">'</span><span class="s">&gt;</span></code></pre></figure>
<p>Now you can run PySpark code in the shell to see Spark Connect in action:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="o">&gt;&gt;&gt;</span> <span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="sh">"</span><span class="s">id</span><span class="sh">"</span><span class="p">,</span> <span class="sh">"</span><span class="s">name</span><span class="sh">"</span><span class="p">]</span>
<span class="o">&gt;&gt;&gt;</span> <span class="n">data</span> <span class="o">=</span> <span class="p">[(</span><span class="mi">1</span><span class="p">,</span><span class="sh">"</span><span class="s">Sarah</span><span class="sh">"</span><span class="p">),</span> <span class="p">(</span><span class="mi">2</span><span class="p">,</span><span class="sh">"</span><span class="s">Maria</span><span class="sh">"</span><span class="p">)]</span>
<span class="o">&gt;&gt;&gt;</span> <span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="nf">createDataFrame</span><span class="p">(</span><span class="n">data</span><span class="p">).</span><span class="nf">toDF</span><span class="p">(</span><span class="o">*</span><span class="n">columns</span><span class="p">)</span>
<span class="o">&gt;&gt;&gt;</span> <span class="n">df</span><span class="p">.</span><span class="nf">show</span><span class="p">()</span>
<span class="o">+---+-----+</span>
<span class="o">|</span> <span class="nb">id</span><span class="o">|</span> <span class="n">name</span><span class="o">|</span>
<span class="o">+---+-----+</span>
<span class="o">|</span> <span class="mi">1</span><span class="o">|</span><span class="n">Sarah</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">2</span><span class="o">|</span><span class="n">Maria</span><span class="o">|</span>
<span class="o">+---+-----+</span></code></pre></figure>
</div>
<div data-lang="scala">
<p>For the Scala shell, we use an Ammonite-based REPL. Otherwise, it is very similar to the PySpark shell.</p>
<figure class="highlight"><pre><code class="language-bash" data-lang="bash">./bin/spark-shell <span class="nt">--remote</span> <span class="s2">"sc://localhost"</span></code></pre></figure>
<p>A greeting message will appear when the REPL successfully initializes:</p>
<figure class="highlight"><pre><code class="language-bash" data-lang="bash">Welcome to
____ __
/ __/__ ___ _____/ /__
_<span class="se">\ \/</span> _ <span class="se">\/</span> _ <span class="sb">`</span>/ __/ <span class="s1">'_/
/___/ .__/\_,_/_/ /_/\_\ version 4.0.1
/_/
Type in expressions to have them evaluated.
Spark session available as '</span>spark<span class="s1">'.</span></code></pre></figure>
<p>By default, the REPL will attempt to connect to a local Spark Server.
Run the following Scala code in the shell to see Spark Connect in action:</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">@</span> <span class="nv">spark</span><span class="o">.</span><span class="py">range</span><span class="o">(</span><span class="mi">10</span><span class="o">).</span><span class="py">count</span>
<span class="n">res0</span><span class="k">:</span> <span class="kt">Long</span> <span class="o">=</span> <span class="mi">10L</span></code></pre></figure>
<h3 id="configure-client-server-connection">Configure client-server connection</h3>
<p>By default, the REPL will attempt to connect to a local Spark Server on port 15002.
The connection, however, may be configured in several ways as described in this configuration
<a href="https://github.com/apache/spark/blob/master/sql/connect/docs/client-connection-string.md">reference</a>.</p>
<h4 id="set-sparkremote-environment-variable-1">Set SPARK_REMOTE environment variable</h4>
<p>The SPARK_REMOTE environment variable can be set on the client machine to customize the client-server
connection that is initialized at REPL startup.</p>
<figure class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nb">export </span><span class="nv">SPARK_REMOTE</span><span class="o">=</span><span class="s2">"sc://myhost.com:443/;token=ABCDEFG"</span>
./bin/spark-shell</code></pre></figure>
<p>or</p>
<figure class="highlight"><pre><code class="language-bash" data-lang="bash"><span class="nv">SPARK_REMOTE</span><span class="o">=</span><span class="s2">"sc://myhost.com:443/;token=ABCDEFG"</span> spark-connect-repl</code></pre></figure>
<h4 id="configure-programmatically-with-a-connection-string">Configure programmatically with a connection string</h4>
<p>The connection may also be programmatically created using <em>SparkSession#builder</em> as in this example:</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">@</span> <span class="k">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span>
<span class="k">@</span> <span class="k">val</span> <span class="nv">spark</span> <span class="k">=</span> <span class="nv">SparkSession</span><span class="o">.</span><span class="py">builder</span><span class="o">.</span><span class="py">remote</span><span class="o">(</span><span class="s">"sc://localhost:443/;token=ABCDEFG"</span><span class="o">).</span><span class="py">getOrCreate</span><span class="o">()</span></code></pre></figure>
</div>
</div>
<h2 id="use-spark-connect-in-standalone-applications">Use Spark Connect in standalone applications</h2>
<div class="codetabs">
<div data-lang="python">
<p>First, install PySpark with <code class="language-plaintext highlighter-rouge">pip install pyspark[connect]==4.0.1</code> or if building a packaged PySpark application/library,
add it to your setup.py file as:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="n">install_requires</span><span class="o">=</span><span class="p">[</span>
<span class="sh">'</span><span class="s">pyspark[connect]==4.0.1</span><span class="sh">'</span>
<span class="p">]</span></code></pre></figure>
<p>When writing your own code, include the <code class="language-plaintext highlighter-rouge">remote</code> function with a reference to
your Spark server when you create a Spark session, as in this example:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="n">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span>
<span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="p">.</span><span class="n">builder</span><span class="p">.</span><span class="nf">remote</span><span class="p">(</span><span class="sh">"</span><span class="s">sc://localhost</span><span class="sh">"</span><span class="p">).</span><span class="nf">getOrCreate</span><span class="p">()</span></code></pre></figure>
<p>For illustration purposes, we’ll create a simple Spark Connect application, SimpleApp.py:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="sh">"""</span><span class="s">SimpleApp.py</span><span class="sh">"""</span>
<span class="kn">from</span> <span class="n">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span>
<span class="n">logFile</span> <span class="o">=</span> <span class="sh">"</span><span class="s">YOUR_SPARK_HOME/README.md</span><span class="sh">"</span> <span class="c1"># Should be some file on your system
</span><span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span><span class="p">.</span><span class="n">builder</span><span class="p">.</span><span class="nf">remote</span><span class="p">(</span><span class="sh">"</span><span class="s">sc://localhost</span><span class="sh">"</span><span class="p">).</span><span class="nf">appName</span><span class="p">(</span><span class="sh">"</span><span class="s">SimpleApp</span><span class="sh">"</span><span class="p">).</span><span class="nf">getOrCreate</span><span class="p">()</span>
<span class="n">logData</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="nf">text</span><span class="p">(</span><span class="n">logFile</span><span class="p">).</span><span class="nf">cache</span><span class="p">()</span>
<span class="n">numAs</span> <span class="o">=</span> <span class="n">logData</span><span class="p">.</span><span class="nf">filter</span><span class="p">(</span><span class="n">logData</span><span class="p">.</span><span class="n">value</span><span class="p">.</span><span class="nf">contains</span><span class="p">(</span><span class="sh">'</span><span class="s">a</span><span class="sh">'</span><span class="p">)).</span><span class="nf">count</span><span class="p">()</span>
<span class="n">numBs</span> <span class="o">=</span> <span class="n">logData</span><span class="p">.</span><span class="nf">filter</span><span class="p">(</span><span class="n">logData</span><span class="p">.</span><span class="n">value</span><span class="p">.</span><span class="nf">contains</span><span class="p">(</span><span class="sh">'</span><span class="s">b</span><span class="sh">'</span><span class="p">)).</span><span class="nf">count</span><span class="p">()</span>
<span class="nf">print</span><span class="p">(</span><span class="sh">"</span><span class="s">Lines with a: %i, lines with b: %i</span><span class="sh">"</span> <span class="o">%</span> <span class="p">(</span><span class="n">numAs</span><span class="p">,</span> <span class="n">numBs</span><span class="p">))</span>
<span class="n">spark</span><span class="p">.</span><span class="nf">stop</span><span class="p">()</span></code></pre></figure>
<p>This program just counts the number of lines containing ‘a’ and the number containing ‘b’ in a text file.
Note that you’ll need to replace YOUR_SPARK_HOME with the location where Spark is installed.</p>
<p>We can run this application with the regular Python interpreter as follows:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="c1"># Use the Python interpreter to run your application
</span><span class="err">$</span> <span class="n">python</span> <span class="n">SimpleApp</span><span class="p">.</span><span class="n">py</span>
<span class="bp">...</span>
<span class="n">Lines</span> <span class="k">with</span> <span class="n">a</span><span class="p">:</span> <span class="mi">72</span><span class="p">,</span> <span class="n">lines</span> <span class="k">with</span> <span class="n">b</span><span class="p">:</span> <span class="mi">39</span></code></pre></figure>
</div>
<div data-lang="scala">
<p>To use Spark Connect as part of a Scala application/project, we first need to include the right dependencies.
Using the <code class="language-plaintext highlighter-rouge">sbt</code> build system as an example, we add the following dependencies to the <code class="language-plaintext highlighter-rouge">build.sbt</code> file:</p>
<figure class="highlight"><pre><code class="language-sbt" data-lang="sbt">libraryDependencies += "org.apache.spark" %% "spark-connect-client-jvm" % "4.0.1"</code></pre></figure>
<p>When writing your own code, include the <code class="language-plaintext highlighter-rouge">remote</code> function with a reference to
your Spark server when you create a Spark session, as in this example:</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span>
<span class="k">val</span> <span class="nv">spark</span> <span class="k">=</span> <span class="nv">SparkSession</span><span class="o">.</span><span class="py">builder</span><span class="o">().</span><span class="py">remote</span><span class="o">(</span><span class="s">"sc://localhost"</span><span class="o">).</span><span class="py">getOrCreate</span><span class="o">()</span></code></pre></figure>
<p><strong>Note</strong>: Operations that reference user-defined code such as UDFs, filter, map, etc. require a
<a href="https://github.com/apache/spark/blob/master/sql/connect/common/src/main/scala/org/apache/spark/sql/connect/client/ClassFinder.scala">ClassFinder</a>
to be registered to pick up and upload any required classfiles. Also, any JAR dependencies must be uploaded to the server using <code class="language-plaintext highlighter-rouge">SparkSession#addArtifact</code>.</p>
<p>Example:</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.sql.connect.client.REPLClassDirMonitor</span>
<span class="c1">// Register a ClassFinder to monitor and upload the classfiles from the build output.</span>
<span class="k">val</span> <span class="nv">classFinder</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">REPLClassDirMonitor</span><span class="o">(&lt;</span><span class="nc">ABSOLUTE_PATH_TO_BUILD_OUTPUT_DIR</span><span class="o">&gt;)</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">registerClassFinder</span><span class="o">(</span><span class="n">classFinder</span><span class="o">)</span>
<span class="c1">// Upload JAR dependencies</span>
<span class="nv">spark</span><span class="o">.</span><span class="py">addArtifact</span><span class="o">(&lt;</span><span class="nc">ABSOLUTE_PATH_JAR_DEP</span><span class="o">&gt;)</span></code></pre></figure>
<p>Here, <code class="language-plaintext highlighter-rouge">ABSOLUTE_PATH_TO_BUILD_OUTPUT_DIR</code> is the output directory where the build system writes classfiles into
and <code class="language-plaintext highlighter-rouge">ABSOLUTE_PATH_JAR_DEP</code> is the location of the JAR on the local file system.</p>
<p>The <code class="language-plaintext highlighter-rouge">REPLClassDirMonitor</code> is a provided implementation of <code class="language-plaintext highlighter-rouge">ClassFinder</code> that monitors a specific directory, but
you may implement your own class extending <code class="language-plaintext highlighter-rouge">ClassFinder</code> for customized search and monitoring.</p>
</div>
</div>
<p>For more information on application development with Spark Connect as well as extending Spark Connect
with custom functionality, see <a href="app-dev-spark-connect.html">Application Development with Spark Connect</a>.</p>
<h1 id="client-application-authentication">Client application authentication</h1>
<p>While Spark Connect does not have built-in authentication, it is designed to
work seamlessly with your existing authentication infrastructure. Its gRPC
HTTP/2 interface allows for the use of authenticating proxies, which makes
it possible to secure Spark Connect without having to implement authentication
logic in Spark directly.</p>
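<p>For example, credentials can be carried in the connection string and validated by a proxy that sits in front of the Spark Connect endpoint. A hedged sketch (the host and token are placeholders; the <code class="language-plaintext highlighter-rouge">use_ssl</code> and <code class="language-plaintext highlighter-rouge">token</code> parameters follow the client connection string reference linked above):</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python">from pyspark.sql import SparkSession

# Hedged sketch: the bearer token rides on the gRPC/HTTP2 connection so an
# authenticating proxy in front of the server can validate it before
# forwarding traffic to the Spark Connect endpoint.
spark = (
    SparkSession.builder
    .remote("sc://auth-proxy.example.com:443/;use_ssl=true;token=YOUR_TOKEN")
    .getOrCreate()
)</code></pre></figure>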
<h1 id="what-is-supported">What is supported</h1>
<p><strong>PySpark</strong>: Since Spark 3.4, Spark Connect supports most PySpark APIs, including
<a href="api/python/reference/pyspark.sql/dataframe.html">DataFrame</a>,
<a href="api/python/reference/pyspark.sql/functions.html">Functions</a>, and
<a href="api/python/reference/pyspark.sql/column.html">Column</a>. However,
some APIs such as <a href="api/python/reference/api/pyspark.SparkContext.html">SparkContext</a>
and <a href="api/python/reference/api/pyspark.RDD.html">RDD</a> are not supported.
You can check which APIs are currently
supported in the <a href="api/python/reference/index.html">API reference</a> documentation.
Supported APIs are labeled &#8220;Supports Spark Connect&#8221; so you can check whether the
APIs you are using are available before migrating existing code to Spark Connect.</p>
<p><strong>Scala</strong>: Since Spark 3.5, Spark Connect supports most Scala APIs, including
<a href="api/scala/org/apache/spark/sql/Dataset.html">Dataset</a>,
<a href="api/scala/org/apache/spark/sql/functions$.html">functions</a>,
<a href="api/scala/org/apache/spark/sql/Column.html">Column</a>,
<a href="api/scala/org/apache/spark/sql/catalog/Catalog.html">Catalog</a> and
<a href="api/scala/org/apache/spark/sql/KeyValueGroupedDataset.html">KeyValueGroupedDataset</a>.</p>
<p>User-Defined Functions (UDFs) are supported: by default in the shell, and in standalone applications with
additional set-up requirements.</p>
<p>The majority of the Streaming API is supported, including
<a href="api/scala/org/apache/spark/sql/streaming/DataStreamReader.html">DataStreamReader</a>,
<a href="api/scala/org/apache/spark/sql/streaming/DataStreamWriter.htmll">DataStreamWriter</a>,
<a href="api/scala/org/apache/spark/sql/streaming/StreamingQuery.html">StreamingQuery</a> and
<a href="api/scala/org/apache/spark/sql/streaming/StreamingQueryListener.html">StreamingQueryListener</a>.</p>
<p>APIs such as <a href="api/scala/org/apache/spark/SparkContext.html">SparkContext</a>
and <a href="api/scala/org/apache/spark/rdd/RDD.html">RDD</a> are unsupported in Spark Connect.</p>
<p>Support for more APIs is planned for upcoming Spark releases.</p>
</div>
<!-- /container -->
</div>
<script src="js/vendor/jquery-3.5.1.min.js"></script>
<script src="js/vendor/bootstrap.bundle.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<script type="text/javascript" src="js/vendor/docsearch.min.js"></script>
<script type="text/javascript">
// DocSearch is entirely free and automated. DocSearch is built in two parts:
// 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link
// in your website and extracts content from every page it traverses. It then pushes this
// content to an Algolia index.
// 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index
// to your search input and display its results in a dropdown UI. If you want to find more
// details on how DocSearch works, check the docs of DocSearch.
docsearch({
apiKey: 'd62f962a82bc9abb53471cb7b89da35e',
appId: 'RAI69RXRSK',
indexName: 'apache_spark',
inputSelector: '#docsearch-input',
enhancedSearchInput: true,
algoliaOptions: {
'facetFilters': ["version:4.0.1"]
},
debug: false // Set debug to true if you want to inspect the dropdown
});
</script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
// We could use "//cdn.mathjax...", but that won't support "file://".
(function(d, script) {
script = d.createElement('script');
script.type = 'text/javascript';
script.async = true;
script.onload = function(){
MathJax.Hub.Config({
tex2jax: {
inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
processEscapes: true,
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
});
};
script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
'?config=TeX-AMS-MML_HTMLorMML';
d.getElementsByTagName('head')[0].appendChild(script);
}(document));
</script>
</body>
</html>