<!DOCTYPE html>
<html class="no-js">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Application Development with Spark Connect - Spark 4.1.0-preview1 Documentation</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&Courier+Prime:wght@400;700&display=swap" rel="stylesheet">
<link href="css/custom.css" rel="stylesheet">
<script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
<link rel="stylesheet" href="css/pygments-default.css">
<link rel="stylesheet" href="css/docsearch.min.css" />
<link rel="stylesheet" href="css/docsearch.css">
<!-- Matomo -->
<script>
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="global">
<!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
<nav class="navbar navbar-expand-lg navbar-dark p-0 px-4 fixed-top" style="background: #1d6890;" id="topbar">
<div class="navbar-brand"><a href="index.html">
<img src="https://spark.apache.org/images/spark-logo-rev.svg" width="141" height="72"/></a><span class="version">4.1.0-preview1</span>
</div>
<button class="navbar-toggler" type="button" data-toggle="collapse"
data-target="#navbarCollapse" aria-controls="navbarCollapse"
aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarCollapse">
<ul class="navbar-nav me-auto">
<li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a>
<div class="dropdown-menu" aria-labelledby="navbarQuickStart">
<a class="dropdown-item" href="quick-start.html">Quick Start</a>
<a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a>
<a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a>
<a class="dropdown-item" href="streaming/index.html">Structured Streaming</a>
<a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a>
<a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a>
<a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a>
<a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a>
<a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a>
<a class="dropdown-item" href="declarative-pipelines-programming-guide.html">Declarative Pipelines</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a>
<div class="dropdown-menu" aria-labelledby="navbarAPIDocs">
<a class="dropdown-item" href="api/python/index.html">Python</a>
<a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a>
<a class="dropdown-item" href="api/java/index.html">Java</a>
<a class="dropdown-item" href="api/R/index.html">R</a>
<a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a>
<div class="dropdown-menu" aria-labelledby="navbarDeploying">
<a class="dropdown-item" href="cluster-overview.html">Overview</a>
<a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a>
<a class="dropdown-item" href="running-on-yarn.html">YARN</a>
<a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
<div class="dropdown-menu" aria-labelledby="navbarMore">
<a class="dropdown-item" href="configuration.html">Configuration</a>
<a class="dropdown-item" href="monitoring.html">Monitoring</a>
<a class="dropdown-item" href="tuning.html">Tuning Guide</a>
<a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a>
<a class="dropdown-item" href="security.html">Security</a>
<a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a>
<a class="dropdown-item" href="migration-guide.html">Migration Guide</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="building-spark.html">Building Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a>
</div>
</li>
<li class="nav-item">
<input type="text" id="docsearch-input" placeholder="Search the docs…">
</li>
</ul>
<!--<span class="navbar-text navbar-right"><span class="version-text">v4.1.0-preview1</span></span>-->
</div>
</nav>
<div class="container">
<div class="content mr-3" id="content">
<h1 class="title">Application Development with Spark Connect</h1>
<p><strong>Spark Connect Overview</strong></p>
<p>In Apache Spark 3.4, Spark Connect introduced a decoupled client-server
architecture that allows remote connectivity to Spark clusters using the
DataFrame API and unresolved logical plans as the protocol. The separation
between client and server allows Spark and its open ecosystem to be
leveraged from anywhere: it can be embedded in modern data applications,
IDEs, notebooks, and programming languages.</p>
<p>To learn more about Spark Connect, see <a href="spark-connect-overview.html">Spark Connect Overview</a>.</p>
<h1 id="redefining-spark-applications-using-spark-connect">Redefining Spark Applications using Spark Connect</h1>
<p>With its decoupled client-server architecture, Spark Connect simplifies how Spark Applications are
developed.
The notions of Spark Client Applications and Spark Server Libraries are introduced as follows:</p>
<ul>
<li><em>Spark Client Applications</em> are regular Spark applications that use Spark and its rich ecosystem for
distributed data processing. Examples include ETL pipelines, data preparation, and model training
and inference.</li>
<li><em>Spark Server Libraries</em> build on, extend, and complement Spark&#8217;s functionality, e.g.
<a href="ml-guide.html">MLlib</a> (distributed ML libraries that use Spark&#8217;s powerful distributed processing). Spark Connect
can be extended to expose client-side interfaces for Spark Server Libraries.</li>
</ul>
<p>With Spark 3.4 and Spark Connect, the development of Spark Client Applications is simplified, and
clear extension points and guidelines are provided on how to build Spark Server Libraries, making
it easy for both types of applications to evolve alongside Spark. As illustrated in Fig. 1, Spark
Client applications connect to Spark using the Spark Connect API, which is essentially the
DataFrame API and fully declarative.</p>
<p style="text-align: center;">
<img src="img/extending-spark-connect.png" title="Figure 1: Architecture" alt="Extending Spark
Connect Diagram" />
</p>
<p>Spark Server Libraries extend Spark. They typically provide additional server-side logic integrated
with Spark, which is exposed to client applications as part of the Spark Connect API, using Spark
Connect extension points. For example, the <em>Spark Server Library</em> consists of custom
server-side logic (as indicated by the blue box labeled <em>Custom Library Plugin</em>), which is exposed
to the client via the blue box as part of the Spark Connect API. The client uses this API, e.g.,
alongside PySpark or the Spark Scala client, making it easy for Spark client applications to work
with the custom logic/library.</p>
<h2 id="spark-api-mode-spark-client-and-spark-classic">Spark API Mode: Spark Client and Spark Classic</h2>
<p>Spark provides an API mode, controlled by the <code class="language-plaintext highlighter-rouge">spark.api.mode</code> configuration, that enables Spark Classic applications
to seamlessly switch to Spark Connect. Depending on the value of <code class="language-plaintext highlighter-rouge">spark.api.mode</code>, the application
can run in either Spark Classic or Spark Connect mode. Here is an example:</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="n">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span>
<span class="n">SparkSession</span><span class="p">.</span><span class="n">builder</span><span class="p">.</span><span class="nf">config</span><span class="p">(</span><span class="sh">"</span><span class="s">spark.api.mode</span><span class="sh">"</span><span class="p">,</span> <span class="sh">"</span><span class="s">connect</span><span class="sh">"</span><span class="p">).</span><span class="nf">master</span><span class="p">(</span><span class="sh">"</span><span class="s">...</span><span class="sh">"</span><span class="p">).</span><span class="nf">getOrCreate</span><span class="p">()</span></code></pre></figure>
<p>You can also apply this configuration to both Scala and PySpark applications when submitting them:</p>
<figure class="highlight"><pre><code class="language-bash" data-lang="bash">spark-submit <span class="nt">--master</span> <span class="s2">"..."</span> <span class="nt">--conf</span> spark.api.mode<span class="o">=</span>connect</code></pre></figure>
<p>Additionally, Spark Connect offers convenient options for local testing. By setting <code class="language-plaintext highlighter-rouge">spark.remote</code>
to <code class="language-plaintext highlighter-rouge">local[...]</code> or <code class="language-plaintext highlighter-rouge">local-cluster[...]</code>, you can start a local Spark Connect server and access a Spark
Connect session.</p>
<p>This is similar to using <code class="language-plaintext highlighter-rouge">--conf spark.api.mode=connect</code> with <code class="language-plaintext highlighter-rouge">--master ...</code>. However, note that
<code class="language-plaintext highlighter-rouge">spark.remote</code> and <code class="language-plaintext highlighter-rouge">--remote</code> are limited to <code class="language-plaintext highlighter-rouge">local*</code> values, while <code class="language-plaintext highlighter-rouge">--conf spark.api.mode=connect</code>
with <code class="language-plaintext highlighter-rouge">--master ...</code> supports additional cluster URLs, such as spark://, for broader compatibility with
Spark Classic.</p>
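<p>For example, a minimal sketch of this local-testing setup (the <code class="language-plaintext highlighter-rouge">local[4]</code> master and the toy query are illustrative choices):</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"># A minimal sketch: setting the remote to a local[...] master starts an
# in-process Spark Connect server and returns a session bound to it.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("local[4]").getOrCreate()

# Any declarative DataFrame code now runs through Spark Connect.
spark.range(10).selectExpr("sum(id)").show()</code></pre></figure>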
<h2 id="spark-client-applications">Spark Client Applications</h2>
<p>Spark Client Applications are the <em>regular Spark applications</em> that Spark users develop today, e.g.,
ETL pipelines, data preparation, or model training and inference. These are typically built using
Spark&#8217;s declarative DataFrame and Dataset APIs. With Spark Connect, the core behaviour remains the
same, but there are a few differences:</p>
<ul>
<li>Lower-level, non-declarative APIs (RDDs) can no longer be directly used from Spark Client
applications. Alternatives for missing RDD functionality are provided as part of the higher-level
DataFrame API (see the sketch after this list).</li>
<li>Client applications no longer have direct access to the Spark driver JVM; they are fully
separated from the server.</li>
</ul>
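<p>As a hedged illustration of the RDD point above, here is a minimal sketch of rewriting a typical RDD-style computation with its DataFrame equivalent, which runs unchanged from a Spark Connect client (the numbers and expressions are arbitrary):</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"># Classic RDD style (not available from a Spark Connect client):
#   sc.parallelize(range(100)).map(lambda x: x * 2).sum()

# DataFrame equivalent that works over Spark Connect
# (assumes an existing Spark Connect session named `spark`):
from pyspark.sql import functions as F

df = spark.range(100)                                  # distributed range of ids 0..99
total = df.select(F.sum(F.col("id") * 2)).first()[0]   # 9900
print(total)</code></pre></figure>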
<p>Client applications based on Spark Connect can be submitted in the same way as any previous job.
In addition, Spark Client Applications based on Spark Connect have several benefits compared to
classic Spark applications using earlier Spark versions (3.4 and below):</p>
<ul>
<li><em>Upgradability</em>: Upgrading to new Spark Server versions is seamless, as the Spark Connect API
abstracts any changes/improvements on the server side. Client- and server APIs are cleanly
separated.</li>
<li><em>Simplicity</em>: The number of APIs exposed to the user is reduced from 3 to 2. The Spark Connect API
is fully declarative and consequently easy to learn for new users familiar with SQL.</li>
<li><em>Stability</em>: When using Spark Connect, client applications no longer run on the Spark driver
and therefore neither cause, nor are affected by, instability on the server.</li>
<li><em>Remote connectivity</em>: The decoupled architecture allows remote connectivity to Spark beyond SQL
and JDBC: any application can now interactively use Spark “as a service”.</li>
<li><em>Backwards compatibility</em>: The Spark Connect API is code-compatible with earlier Spark versions,
except for the usage of RDDs, for which a list of alternative APIs is provided in Spark Connect.</li>
</ul>
<h2 id="spark-server-libraries">Spark Server Libraries</h2>
<p>Until Spark 3.4, extensions to Spark (e.g., <a href="ml-guide.html#:~:text=What%20is%20%E2%80%9CSpark%20ML%E2%80%9D%3F,to%20emphasize%20the%20pipeline%20concept.">Spark ML</a>
or <a href="https://github.com/JohnSnowLabs/spark-nlp">Spark-NLP</a>) were built and deployed like Spark
Client Applications. With Spark 3.4 and Spark Connect, explicit extension points are offered to
extend Spark via Spark Server Libraries. These extension points provide functionality that can be
exposed to a client, which differs from existing extension points in Spark such as
<a href="api/java/org/apache/spark/sql/SparkSessionExtensions.html">SparkSession extensions</a> or
<a href="api/java/org/apache/spark/api/plugin/SparkPlugin.html">Spark Plugins</a>.</p>
<h3 id="getting-started-extending-spark-with-spark-server-libraries">Getting Started: Extending Spark with Spark Server Libraries</h3>
<p>Spark Connect is available and supports PySpark and Scala
applications. We will walk through how to run an Apache Spark server with Spark
Connect and connect to it from a client application using the Spark Connect client
library.</p>
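<p>As a brief sketch of that client-side connection step (this assumes a Spark Connect server is already running locally on the default port, 15002):</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"># A minimal sketch: connect to a running Spark Connect server from a client
# application. "sc://localhost:15002" is the default local endpoint; adjust it
# to your server's host and port.
from pyspark.sql import SparkSession

spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()

# The session exposes the regular declarative DataFrame API.
spark.sql("SELECT 1 AS id").show()</code></pre></figure>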
<p>A Spark Server Library consists of the following components, illustrated in Fig. 2:</p>
<ol>
<li>The Spark Connect protocol extension (blue box <em>Proto</em> API).</li>
<li>A Spark Connect Plugin.</li>
<li>The application logic that extends Spark.</li>
<li>The client package that exposes the Spark Server Library application logic to the Spark Client
Application, alongside PySpark or the Scala Spark Client.</li>
</ol>
<p style="text-align: center;">
<img src="img/extending-spark-connect-labelled.png" title="Figure 2: Labelled Architecture" alt="Extending Spark
Connect Diagram - Labelled Steps" />
</p>
<h4 id="1-spark-connect-protocol-extension">(1) Spark Connect Protocol Extension</h4>
<p>To extend Spark with a new Spark Server Library, developers can extend the three main operation
types in the Spark Connect protocol: <em>Relation</em>, <em>Expression</em>, and <em>Command</em>.</p>
<figure class="highlight"><pre><code class="language-protobuf" data-lang="protobuf"><span class="kd">message</span> <span class="nc">Relation</span> <span class="p">{</span>
<span class="k">oneof</span> <span class="n">rel_type</span> <span class="p">{</span>
<span class="n">Read</span> <span class="na">read</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span>
<span class="c1">// ...</span>
<span class="n">google.protobuf.Any</span> <span class="na">extension</span> <span class="o">=</span> <span class="mi">998</span><span class="p">;</span>
<span class="p">}</span>
<span class="p">}</span>
<span class="kd">message</span> <span class="nc">Expression</span> <span class="p">{</span>
<span class="k">oneof</span> <span class="n">expr_type</span> <span class="p">{</span>
<span class="n">Literal</span> <span class="na">literal</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span>
<span class="c1">// ...</span>
<span class="n">google.protobuf.Any</span> <span class="na">extension</span> <span class="o">=</span> <span class="mi">999</span><span class="p">;</span>
<span class="p">}</span>
<span class="p">}</span>
<span class="kd">message</span> <span class="nc">Command</span> <span class="p">{</span>
<span class="k">oneof</span> <span class="n">command_type</span> <span class="p">{</span>
<span class="n">WriteCommand</span> <span class="na">write_command</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span>
<span class="c1">// ...</span>
<span class="n">google.protobuf.Any</span> <span class="na">extension</span> <span class="o">=</span> <span class="mi">999</span><span class="p">;</span>
<span class="p">}</span>
<span class="p">}</span> </code></pre></figure>
<p>Their extension fields allow serializing arbitrary protobuf messages as part of the Spark Connect
protocol. These messages represent the parameters or state of the extension implementation.
To build a custom expression type, the developer first defines the custom protobuf definition
of the expression.</p>
<figure class="highlight"><pre><code class="language-protobuf" data-lang="protobuf"><span class="kd">message</span> <span class="nc">ExamplePluginExpression</span> <span class="p">{</span>
<span class="n">Expression</span> <span class="na">child</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span>
<span class="kt">string</span> <span class="na">custom_field</span> <span class="o">=</span> <span class="mi">2</span><span class="p">;</span>
<span class="p">}</span></code></pre></figure>
<h4 id="2-spark-connect-plugin-implementation-with-3-custom-application-logic">(2) Spark Connect Plugin implementation with (3) custom application logic</h4>
<p>As a next step, the developer implements the <em>ExpressionPlugin</em> class of Spark Connect with custom
application logic based on the input parameters of the protobuf message.</p>
<figure class="highlight"><pre><code class="language-protobuf" data-lang="protobuf"><span class="n">class</span> <span class="n">ExampleExpressionPlugin</span> <span class="n">extends</span> <span class="n">ExpressionPlugin</span> <span class="p">{</span>
<span class="n">override</span> <span class="n">def</span> <span class="n">transform</span><span class="p">(</span>
<span class="n">relation</span><span class="o">:</span> <span class="n">protobuf.Any</span><span class="p">,</span>
<span class="n">planner</span><span class="o">:</span> <span class="n">SparkConnectPlanner</span><span class="p">)</span><span class="o">:</span> <span class="n">Option</span><span class="p">[</span><span class="n">Expression</span><span class="p">]</span> <span class="o">=</span> <span class="p">{</span>
<span class="c1">// Check if the serialized value of protobuf.Any matches the type</span>
<span class="c1">// of our example expression.</span>
<span class="n">if</span> <span class="p">(</span><span class="err">!</span><span class="n">relation.is</span><span class="p">(</span><span class="n">classOf</span><span class="p">[</span><span class="n">proto.ExamplePluginExpression</span><span class="p">]))</span> <span class="p">{</span>
<span class="n">return</span> <span class="n">None</span>
<span class="p">}</span>
<span class="n">val</span> <span class="na">exp</span> <span class="o">=</span> <span class="n">relation.unpack</span><span class="p">(</span><span class="n">classOf</span><span class="p">[</span><span class="n">proto.ExamplePluginExpression</span><span class="p">])</span>
<span class="n">Some</span><span class="p">(</span><span class="n">Alias</span><span class="p">(</span><span class="n">planner.transformExpression</span><span class="p">(</span>
<span class="n">exp.getChild</span><span class="p">),</span> <span class="n">exp.getCustomField</span><span class="p">)(</span><span class="na">explicitMetadata</span> <span class="o">=</span> <span class="n">None</span><span class="p">))</span>
<span class="p">}</span>
<span class="p">}</span></code></pre></figure>
<p>Once the application logic is developed, the code must be packaged as a jar and Spark must be
configured to pick up the additional logic. The relevant Spark configuration options are:</p>
<ul>
<li><em>spark.jars</em>, which defines the location of the JAR file containing the application logic built for
the custom expression.</li>
<li><em>spark.connect.extensions.expression.classes</em>, which specifies the full class name
of each expression extension to be loaded by Spark. Based on these configuration options, Spark will
load the values at startup and make them available for processing.</li>
</ul>
<h4 id="4-spark-server-library-client-package">(4) Spark Server Library Client Package</h4>
<p>Once the server component is deployed, any client can use it with the right protobuf messages.
In the example above, the following message payload sent to the Spark Connect endpoint would be
enough to trigger the extension mechanism.</p>
<figure class="highlight"><pre><code class="language-json" data-lang="json"><span class="p">{</span><span class="w">
</span><span class="nl">"project"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w">
</span><span class="nl">"input"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w">
</span><span class="nl">"sql"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w">
</span><span class="nl">"query"</span><span class="p">:</span><span class="w"> </span><span class="s2">"select * from samples.nyctaxi.trips"</span><span class="w">
</span><span class="p">}</span><span class="w">
</span><span class="p">},</span><span class="w">
</span><span class="nl">"expressions"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="w">
</span><span class="p">{</span><span class="w">
</span><span class="nl">"extension"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w">
</span><span class="nl">"typeUrl"</span><span class="p">:</span><span class="w"> </span><span class="s2">"type.googleapis.com/spark.connect.ExamplePluginExpression"</span><span class="p">,</span><span class="w">
</span><span class="nl">"value"</span><span class="p">:</span><span class="w"> </span><span class="s2">"</span><span class="se">\n\0</span><span class="s2">06</span><span class="se">\0</span><span class="s2">22</span><span class="se">\0</span><span class="s2">04</span><span class="se">\n\0</span><span class="s2">02id</span><span class="se">\0</span><span class="s2">22</span><span class="se">\0</span><span class="s2">06testval"</span><span class="w">
</span><span class="p">}</span><span class="w">
</span><span class="p">}</span><span class="w">
</span><span class="p">]</span><span class="w">
</span><span class="p">}</span><span class="w">
</span><span class="p">}</span><span class="w"> </span></code></pre></figure>
<p>To make the example available in Python, the application developer provides a Python library that
wraps the new expression and embeds it into PySpark. The easiest way to provide a function for any
expression is to take a PySpark column instance as an argument and return a new Column instance
with the expression applied.</p>
<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="n">pyspark.sql.connect.column</span> <span class="kn">import</span> <span class="n">Expression</span>
<span class="kn">import</span> <span class="n">pyspark.sql.connect.proto</span> <span class="k">as</span> <span class="n">proto</span>
<span class="kn">from</span> <span class="n">myxample.proto</span> <span class="kn">import</span> <span class="n">ExamplePluginExpression</span>
<span class="c1"># Internal class that satisfies the interface by the Python client
# of Spark Connect to generate the protobuf representation from
# an instance of the expression.
</span><span class="k">class</span> <span class="nc">ExampleExpression</span><span class="p">(</span><span class="n">Expression</span><span class="p">):</span>
<span class="k">def</span> <span class="nf">to_plan</span><span class="p">(</span><span class="n">self</span><span class="p">,</span> <span class="n">session</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">proto</span><span class="p">.</span><span class="n">Expression</span><span class="p">:</span>
<span class="n">fun</span> <span class="o">=</span> <span class="n">proto</span><span class="p">.</span><span class="nc">Expression</span><span class="p">()</span>
<span class="n">plugin</span> <span class="o">=</span> <span class="nc">ExamplePluginExpression</span><span class="p">()</span>
<span class="n">plugin</span><span class="p">.</span><span class="n">child</span><span class="p">.</span><span class="n">literal</span><span class="p">.</span><span class="nb">long</span> <span class="o">=</span> <span class="mi">10</span>
<span class="n">plugin</span><span class="p">.</span><span class="n">custom_field</span> <span class="o">=</span> <span class="sh">"</span><span class="s">example</span><span class="sh">"</span>
<span class="n">fun</span><span class="p">.</span><span class="n">extension</span><span class="p">.</span><span class="nc">Pack</span><span class="p">(</span><span class="n">plugin</span><span class="p">)</span>
<span class="k">return</span> <span class="n">fun</span>
<span class="c1"># Defining the function to be used from the consumers.
</span><span class="k">def</span> <span class="nf">example_expression</span><span class="p">(</span><span class="n">col</span><span class="p">:</span> <span class="n">Column</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Column</span><span class="p">:</span>
<span class="k">return</span> <span class="nc">Column</span><span class="p">(</span><span class="nc">ExampleExpression</span><span class="p">())</span>
<span class="c1"># Using the expression in the Spark Connect client code.
</span><span class="n">df</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="nf">table</span><span class="p">(</span><span class="sh">"</span><span class="s">samples.nyctaxi.trips</span><span class="sh">"</span><span class="p">)</span>
<span class="n">df</span><span class="p">.</span><span class="nf">select</span><span class="p">(</span><span class="nf">example_expression</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="sh">"</span><span class="s">fare_amount</span><span class="sh">"</span><span class="p">])).</span><span class="nf">collect</span><span class="p">()</span></code></pre></figure>
</div>
<!-- /container -->
</div>
<script src="js/vendor/jquery-3.5.1.min.js"></script>
<script src="js/vendor/bootstrap.bundle.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<script type="text/javascript" src="js/vendor/docsearch.min.js"></script>
<script type="text/javascript">
// DocSearch is entirely free and automated. DocSearch is built in two parts:
// 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link
// in your website and extract content from every page it traverses. It then pushes this
// content to an Algolia index.
// 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index
// to your search input and display its results in a dropdown UI. If you want to find more
// details on how DocSearch works, check the DocSearch docs.
docsearch({
apiKey: 'd62f962a82bc9abb53471cb7b89da35e',
appId: 'RAI69RXRSK',
indexName: 'apache_spark',
inputSelector: '#docsearch-input',
enhancedSearchInput: true,
algoliaOptions: {
'facetFilters': ["version:4.1.0-preview1"]
},
debug: false // Set debug to true if you want to inspect the dropdown
});
</script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
// We could use "//cdn.mathjax...", but that won't support "file://".
(function(d, script) {
script = d.createElement('script');
script.type = 'text/javascript';
script.async = true;
script.onload = function(){
MathJax.Hub.Config({
tex2jax: {
inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
processEscapes: true,
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
});
};
script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
'?config=TeX-AMS-MML_HTMLorMML';
d.getElementsByTagName('head')[0].appendChild(script);
}(document));
</script>
</body>
</html>