| |
| |
| |
| |
| <!DOCTYPE html> |
| <html class="no-js"> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| |
| <title>Application Development with Spark Connect - Spark 4.1.0-preview1 Documentation</title> |
| |
| |
| |
| |
| |
| <link rel="stylesheet" href="css/bootstrap.min.css"> |
| <link rel="preconnect" href="https://fonts.googleapis.com"> |
| <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> |
| <link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&Courier+Prime:wght@400;700&display=swap" rel="stylesheet"> |
| <link href="css/custom.css" rel="stylesheet"> |
| <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> |
| |
| <link rel="stylesheet" href="css/pygments-default.css"> |
| <link rel="stylesheet" href="css/docsearch.min.css" /> |
| <link rel="stylesheet" href="css/docsearch.css"> |
| |
| |
| <!-- Matomo --> |
| <script> |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| _paq.push(["disableCookies"]); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function() { |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '40']); |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| |
| |
| </head> |
| <body class="global"> |
| <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html --> |
| <nav class="navbar navbar-expand-lg navbar-dark p-0 px-4 fixed-top" style="background: #1d6890;" id="topbar"> |
| <div class="navbar-brand"><a href="index.html"> |
| <img src="https://spark.apache.org/images/spark-logo-rev.svg" width="141" height="72"/></a><span class="version">4.1.0-preview1</span> |
| </div> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" |
| data-target="#navbarCollapse" aria-controls="navbarCollapse" |
| aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| <div class="collapse navbar-collapse" id="navbarCollapse"> |
| <ul class="navbar-nav me-auto"> |
| <li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a> |
| <div class="dropdown-menu" aria-labelledby="navbarQuickStart"> |
| <a class="dropdown-item" href="quick-start.html">Quick Start</a> |
                                <a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcast Vars</a>
| <a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a> |
| <a class="dropdown-item" href="streaming/index.html">Structured Streaming</a> |
| <a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a> |
| <a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a> |
| <a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a> |
| <a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a> |
| <a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a> |
| <a class="dropdown-item" href="declarative-pipelines-programming-guide.html">Declarative Pipelines</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a> |
| <div class="dropdown-menu" aria-labelledby="navbarAPIDocs"> |
| <a class="dropdown-item" href="api/python/index.html">Python</a> |
| <a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a> |
| <a class="dropdown-item" href="api/java/index.html">Java</a> |
| <a class="dropdown-item" href="api/R/index.html">R</a> |
| <a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a> |
| <div class="dropdown-menu" aria-labelledby="navbarDeploying"> |
| <a class="dropdown-item" href="cluster-overview.html">Overview</a> |
| <a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a> |
| <div class="dropdown-divider"></div> |
| <a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a> |
| <a class="dropdown-item" href="running-on-yarn.html">YARN</a> |
| <a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a> |
| <div class="dropdown-menu" aria-labelledby="navbarMore"> |
| <a class="dropdown-item" href="configuration.html">Configuration</a> |
| <a class="dropdown-item" href="monitoring.html">Monitoring</a> |
| <a class="dropdown-item" href="tuning.html">Tuning Guide</a> |
| <a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a> |
| <a class="dropdown-item" href="security.html">Security</a> |
| <a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a> |
| <a class="dropdown-item" href="migration-guide.html">Migration Guide</a> |
| <div class="dropdown-divider"></div> |
| <a class="dropdown-item" href="building-spark.html">Building Spark</a> |
| <a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a> |
| <a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a> |
| </div> |
| </li> |
| |
| <li class="nav-item"> |
| <input type="text" id="docsearch-input" placeholder="Search the docs…"> |
| </li> |
| </ul> |
| <!--<span class="navbar-text navbar-right"><span class="version-text">v4.1.0-preview1</span></span>--> |
| </div> |
| </nav> |
| |
| |
| |
| <div class="container"> |
| |
| <div class="content mr-3" id="content"> |
| |
| |
| <h1 class="title">Application Development with Spark Connect</h1> |
| |
| |
| <p><strong>Spark Connect Overview</strong></p> |
| |
<p>In Apache Spark 3.4, Spark Connect introduced a decoupled client-server
architecture that allows remote connectivity to Spark clusters using the
DataFrame API, with unresolved logical plans as the protocol. The separation
between client and server lets Spark and its open ecosystem be leveraged
from anywhere: it can be embedded in modern data applications, IDEs,
notebooks, and programming languages.</p>
| |
| <p>To learn more about Spark Connect, see <a href="spark-connect-overview.html">Spark Connect Overview</a>.</p> |
| |
| <h1 id="redefining-spark-applications-using-spark-connect">Redefining Spark Applications using Spark Connect</h1> |
| |
<p>With its decoupled client-server architecture, Spark Connect simplifies how Spark Applications are
developed.
Two notions, Spark Client Applications and Spark Server Libraries, are introduced as follows:</p>
| <ul> |
| <li><em>Spark Client Applications</em> are regular Spark applications that use Spark and its rich ecosystem for |
| distributed data processing. Examples include ETL pipelines, data preparation, and model training |
| and inference.</li> |
| <li><em>Spark Server Libraries</em> build on, extend, and complement Spark’s functionality, e.g. |
| <a href="ml-guide.html">MLlib</a> (distributed ML libraries that use Spark’s powerful distributed processing). Spark Connect |
| can be extended to expose client-side interfaces for Spark Server Libraries.</li> |
| </ul> |
| |
<p>With Spark 3.4 and Spark Connect, the development of Spark Client Applications is simplified, and
clear extension points and guidelines are provided on how to build Spark Server Libraries, making
it easy for both types of applications to evolve alongside Spark. As illustrated in Fig. 1, Spark
Client Applications connect to Spark using the Spark Connect API, which is essentially the
DataFrame API and fully declarative.</p>
| |
| <p style="text-align: center;"> |
| <img src="img/extending-spark-connect.png" title="Figure 1: Architecture" alt="Extending Spark |
| Connect Diagram" /> |
| </p> |
<p>Spark Server Libraries extend Spark. They typically provide additional server-side logic integrated
with Spark, which is exposed to client applications as part of the Spark Connect API, using Spark
Connect extension points. For example, in Fig. 1 the <em>Spark Server Library</em> consists of custom
server-side logic (as indicated by the blue box labeled <em>Custom Library Plugin</em>), which is exposed
to the client via the blue box as part of the Spark Connect API. The client uses this API, e.g.,
alongside PySpark or the Spark Scala client, making it easy for Spark client applications to work
with the custom logic/library.</p>
| |
| <h2 id="spark-api-mode-spark-client-and-spark-classic">Spark API Mode: Spark Client and Spark Classic</h2> |
| |
<p>Spark provides an API mode, set via the <code class="language-plaintext highlighter-rouge">spark.api.mode</code> configuration, that enables Spark Classic applications
to seamlessly switch to Spark Connect. Depending on the value of <code class="language-plaintext highlighter-rouge">spark.api.mode</code>, the application
can run in either Spark Classic or Spark Connect mode. Here is an example:</p>
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="n">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| |
| <span class="n">SparkSession</span><span class="p">.</span><span class="n">builder</span><span class="p">.</span><span class="nf">config</span><span class="p">(</span><span class="sh">"</span><span class="s">spark.api.mode</span><span class="sh">"</span><span class="p">,</span> <span class="sh">"</span><span class="s">connect</span><span class="sh">"</span><span class="p">).</span><span class="nf">master</span><span class="p">(</span><span class="sh">"</span><span class="s">...</span><span class="sh">"</span><span class="p">).</span><span class="nf">getOrCreate</span><span class="p">()</span></code></pre></figure> |
| |
<p>You can also apply this configuration to both Scala and PySpark applications when submitting them:</p>
| |
| <figure class="highlight"><pre><code class="language-bash" data-lang="bash">spark-submit <span class="nt">--master</span> <span class="s2">"..."</span> <span class="nt">--conf</span> spark.api.mode<span class="o">=</span>connect</code></pre></figure> |
| |
| <p>Additionally, Spark Connect offers convenient options for local testing. By setting <code class="language-plaintext highlighter-rouge">spark.remote</code> |
| to <code class="language-plaintext highlighter-rouge">local[...]</code> or <code class="language-plaintext highlighter-rouge">local-cluster[...]</code>, you can start a local Spark Connect server and access a Spark |
| Connect session.</p> |
| |
<p>This is similar to using <code class="language-plaintext highlighter-rouge">--conf spark.api.mode=connect</code> with <code class="language-plaintext highlighter-rouge">--master ...</code>. However, note that
<code class="language-plaintext highlighter-rouge">spark.remote</code> and <code class="language-plaintext highlighter-rouge">--remote</code> are limited to <code class="language-plaintext highlighter-rouge">local*</code> values, while <code class="language-plaintext highlighter-rouge">--conf spark.api.mode=connect</code>
with <code class="language-plaintext highlighter-rouge">--master ...</code> supports additional cluster URLs, such as <code class="language-plaintext highlighter-rouge">spark://</code>, for broader compatibility with
Spark Classic.</p>
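<p>As a minimal sketch of this local-testing path (assuming a local machine with PySpark installed), the following starts a local Spark Connect server in the background and returns a session bound to it:</p>

<figure class="highlight"><pre><code class="language-python" data-lang="python">from pyspark.sql import SparkSession

# A local remote value launches a local Spark Connect server in the
# background and returns a Spark Connect session bound to it.
spark = SparkSession.builder.remote("local[*]").getOrCreate()

# The session can then be used like a regular SparkSession.
spark.range(5).show()</code></pre></figure>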
| |
| <h2 id="spark-client-applications">Spark Client Applications</h2> |
| |
<p>Spark Client Applications are the <em>regular Spark applications</em> that Spark users develop today, e.g.,
ETL pipelines, data preparation, or model training and inference. These are typically built using
Spark’s declarative DataFrame and Dataset APIs. With Spark Connect, the core behavior remains the
same, but there are a few differences:</p>
| <ul> |
  <li>Lower-level, non-declarative APIs (RDDs) can no longer be directly used from Spark Client
applications. Alternatives for missing RDD functionality are provided as part of the higher-level
DataFrame API (see the sketch after this list).</li>
| <li>Client applications no longer have direct access to the Spark driver JVM; they are fully |
| separated from the server.</li> |
| </ul> |
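<p>As a hedged illustration of such an alternative (assuming an existing Spark Connect session named <code class="language-plaintext highlighter-rouge">spark</code>), logic that would previously build an RDD with <code class="language-plaintext highlighter-rouge">sc.parallelize</code> and transform it with <code class="language-plaintext highlighter-rouge">rdd.map</code> can usually be expressed through the DataFrame API instead:</p>

<figure class="highlight"><pre><code class="language-python" data-lang="python">from pyspark.sql import functions as F

# Spark Classic, not available over Spark Connect:
#   sc.parallelize([1, 2, 3]).map(lambda x: x * 2).collect()

# DataFrame-based equivalent that works with Spark Connect:
df = spark.createDataFrame([(1,), (2,), (3,)], ["value"])
df.select((F.col("value") * 2).alias("doubled")).collect()</code></pre></figure>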
| |
| <p>Client applications based on Spark Connect can be submitted in the same way as any previous job. |
| In addition, Spark Client Applications based on Spark Connect have several benefits compared to |
| classic Spark applications using earlier Spark versions (3.4 and below):</p> |
| <ul> |
  <li><em>Upgradability</em>: Upgrading to new Spark Server versions is seamless, as the Spark Connect API
abstracts any changes/improvements on the server side. Client and server APIs are cleanly
separated.</li>
  <li><em>Simplicity</em>: The number of APIs exposed to the user is reduced from 3 to 2. The Spark Connect API
is fully declarative and consequently easy to learn for users familiar with SQL.</li>
  <li><em>Stability</em>: When using Spark Connect, client applications no longer run on the Spark driver
and therefore don’t cause, and are not affected by, any instability on the server.</li>
| <li><em>Remote connectivity</em>: The decoupled architecture allows remote connectivity to Spark beyond SQL |
| and JDBC: any application can now interactively use Spark “as a service”.</li> |
| <li><em>Backwards compatibility</em>: The Spark Connect API is code-compatible with earlier Spark versions, |
| except for the usage of RDDs, for which a list of alternative APIs is provided in Spark Connect.</li> |
| </ul> |
| |
| <h2 id="spark-server-libraries">Spark Server Libraries</h2> |
| |
<p>Until Spark 3.4, extensions to Spark (e.g., <a href="ml-guide.html#:~:text=What%20is%20%E2%80%9CSpark%20ML%E2%80%9D%3F,to%20emphasize%20the%20pipeline%20concept.">Spark ML</a>
| or <a href="https://github.com/JohnSnowLabs/spark-nlp">Spark-NLP</a>) were built and deployed like Spark |
| Client Applications. With Spark 3.4 and Spark Connect, explicit extension points are offered to |
| extend Spark via Spark Server Libraries. These extension points provide functionality that can be |
| exposed to a client, which differs from existing extension points in Spark such as |
| <a href="api/java/org/apache/spark/sql/SparkSessionExtensions.html">SparkSession extensions</a> or |
| <a href="api/java/org/apache/spark/api/plugin/SparkPlugin.html">Spark Plugins</a>.</p> |
| |
| <h3 id="getting-started-extending-spark-with-spark-server-libraries">Getting Started: Extending Spark with Spark Server Libraries</h3> |
| |
<p>Spark Connect is available for both PySpark and Scala
applications. We will walk through how to run an Apache Spark server with Spark
Connect and connect to it from a client application using the Spark Connect client
library.</p>
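<p>As a brief, hedged sketch of that workflow (assuming a Spark Connect server has already been started, e.g. via <code class="language-plaintext highlighter-rouge">./sbin/start-connect-server.sh</code>, and is listening on the default port 15002):</p>

<figure class="highlight"><pre><code class="language-python" data-lang="python">from pyspark.sql import SparkSession

# Connect to the running Spark Connect server via its sc:// URL.
spark = SparkSession.builder.remote("sc://localhost:15002").getOrCreate()
spark.range(3).show()</code></pre></figure>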
| |
| <p>A Spark Server Library consists of the following components, illustrated in Fig. 2:</p> |
| |
<ol>
  <li>The Spark Connect protocol extension (blue box <em>Proto</em> API).</li>
  <li>A Spark Connect Plugin.</li>
  <li>The application logic that extends Spark.</li>
  <li>The client package that exposes the Spark Server Library application logic to the Spark Client
Application, alongside PySpark or the Scala Spark Client.</li>
</ol>
| <p style="text-align: center;"> |
| <img src="img/extending-spark-connect-labelled.png" title="Figure 2: Labelled Architecture" alt="Extending Spark |
| Connect Diagram - Labelled Steps" /> |
| </p> |
| |
| <h4 id="1-spark-connect-protocol-extension">(1) Spark Connect Protocol Extension</h4> |
| |
| <p>To extend Spark with a new Spark Server Library, developers can extend the three main operation |
| types in the Spark Connect protocol: <em>Relation</em>, <em>Expression</em>, and <em>Command</em>.</p> |
| |
| <figure class="highlight"><pre><code class="language-protobuf" data-lang="protobuf"><span class="kd">message</span> <span class="nc">Relation</span> <span class="p">{</span> |
| <span class="k">oneof</span> <span class="n">rel_type</span> <span class="p">{</span> |
| <span class="n">Read</span> <span class="na">read</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span> |
| <span class="c1">// ...</span> |
| <span class="n">google.protobuf.Any</span> <span class="na">extension</span> <span class="o">=</span> <span class="mi">998</span><span class="p">;</span> |
| <span class="p">}</span> |
| <span class="p">}</span> |
| |
| <span class="kd">message</span> <span class="nc">Expression</span> <span class="p">{</span> |
| <span class="k">oneof</span> <span class="n">expr_type</span> <span class="p">{</span> |
| <span class="n">Literal</span> <span class="na">literal</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span> |
| <span class="c1">// ...</span> |
| <span class="n">google.protobuf.Any</span> <span class="na">extension</span> <span class="o">=</span> <span class="mi">999</span><span class="p">;</span> |
| <span class="p">}</span> |
| <span class="p">}</span> |
| |
| <span class="kd">message</span> <span class="nc">Command</span> <span class="p">{</span> |
| <span class="k">oneof</span> <span class="n">command_type</span> <span class="p">{</span> |
| <span class="n">WriteCommand</span> <span class="na">write_command</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span> |
| <span class="c1">// ...</span> |
| <span class="n">google.protobuf.Any</span> <span class="na">extension</span> <span class="o">=</span> <span class="mi">999</span><span class="p">;</span> |
| <span class="p">}</span> |
| <span class="p">}</span> </code></pre></figure> |
| |
| <p>Their extension fields allow serializing arbitrary protobuf messages as part of the Spark Connect |
| protocol. These messages represent the parameters or state of the extension implementation. |
| To build a custom expression type, the developer first defines the custom protobuf definition |
| of the expression.</p> |
| |
| <figure class="highlight"><pre><code class="language-protobuf" data-lang="protobuf"><span class="kd">message</span> <span class="nc">ExamplePluginExpression</span> <span class="p">{</span> |
| <span class="n">Expression</span> <span class="na">child</span> <span class="o">=</span> <span class="mi">1</span><span class="p">;</span> |
| <span class="kt">string</span> <span class="na">custom_field</span> <span class="o">=</span> <span class="mi">2</span><span class="p">;</span> |
| <span class="p">}</span></code></pre></figure> |
| |
| <h4 id="2-spark-connect-plugin-implementation-with-3-custom-application-logic">(2) Spark Connect Plugin implementation with (3) custom application logic</h4> |
| |
<p>As a next step, the developer implements the <em>ExpressionPlugin</em> interface of Spark Connect with custom
application logic based on the input parameters of the protobuf message.</p>
| |
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">class ExampleExpressionPlugin extends ExpressionPlugin {
  override def transform(
      relation: protobuf.Any,
      planner: SparkConnectPlanner): Option[Expression] = {
    // Check if the serialized value of protobuf.Any matches the type
    // of our example expression.
    if (!relation.is(classOf[proto.ExamplePluginExpression])) {
      return None
    }
    val exp = relation.unpack(classOf[proto.ExamplePluginExpression])
    Some(Alias(planner.transformExpression(
      exp.getChild), exp.getCustomField)(explicitMetadata = None))
  }
}</code></pre></figure>
| |
<p>Once the application logic is developed, the code must be packaged as a JAR, and Spark must be
configured to pick up the additional logic. The relevant Spark configuration options are:</p>
<ul>
  <li><em>spark.jars</em>, which defines the location of the JAR file containing the application logic built for
the custom expression.</li>
  <li><em>spark.connect.extensions.expression.classes</em>, which specifies the full class name
of each expression extension loaded by Spark. Based on these configuration options, Spark
loads the plugin classes at startup and makes them available for processing.</li>
</ul>
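<p>A minimal sketch of wiring these options together when starting a local Spark Connect server from PySpark is shown below. The JAR path and plugin class name are hypothetical placeholders, and the sketch assumes the locally launched server picks up the builder configuration at startup; for a standalone server, the same values can instead be passed as <code class="language-plaintext highlighter-rouge">--jars</code> and <code class="language-plaintext highlighter-rouge">--conf</code> flags to <code class="language-plaintext highlighter-rouge">./sbin/start-connect-server.sh</code>.</p>

<figure class="highlight"><pre><code class="language-python" data-lang="python">from pyspark.sql import SparkSession

# Hypothetical JAR path and plugin class name; replace with your own.
spark = (
    SparkSession.builder
    .remote("local[*]")
    .config("spark.jars", "/path/to/my-server-library.jar")
    .config("spark.connect.extensions.expression.classes",
            "org.example.ExampleExpressionPlugin")
    .getOrCreate()
)</code></pre></figure>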
| |
| <h4 id="4-spark-server-library-client-package">(4) Spark Server Library Client Package</h4> |
| |
| <p>Once the server component is deployed, any client can use it with the right protobuf messages. |
| In the example above, the following message payload sent to the Spark Connect endpoint would be |
| enough to trigger the extension mechanism.</p> |
| |
| <figure class="highlight"><pre><code class="language-json" data-lang="json"><span class="p">{</span><span class="w"> |
| </span><span class="nl">"project"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w"> |
| </span><span class="nl">"input"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w"> |
| </span><span class="nl">"sql"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w"> |
| </span><span class="nl">"query"</span><span class="p">:</span><span class="w"> </span><span class="s2">"select * from samples.nyctaxi.trips"</span><span class="w"> |
| </span><span class="p">}</span><span class="w"> |
| </span><span class="p">},</span><span class="w"> |
| </span><span class="nl">"expressions"</span><span class="p">:</span><span class="w"> </span><span class="p">[</span><span class="w"> |
| </span><span class="p">{</span><span class="w"> |
| </span><span class="nl">"extension"</span><span class="p">:</span><span class="w"> </span><span class="p">{</span><span class="w"> |
| </span><span class="nl">"typeUrl"</span><span class="p">:</span><span class="w"> </span><span class="s2">"type.googleapis.com/spark.connect.ExamplePluginExpression"</span><span class="p">,</span><span class="w"> |
| </span><span class="nl">"value"</span><span class="p">:</span><span class="w"> </span><span class="s2">"</span><span class="se">\n\0</span><span class="s2">06</span><span class="se">\0</span><span class="s2">22</span><span class="se">\0</span><span class="s2">04</span><span class="se">\n\0</span><span class="s2">02id</span><span class="se">\0</span><span class="s2">22</span><span class="se">\0</span><span class="s2">06testval"</span><span class="w"> |
| </span><span class="p">}</span><span class="w"> |
| </span><span class="p">}</span><span class="w"> |
| </span><span class="p">]</span><span class="w"> |
| </span><span class="p">}</span><span class="w"> |
| </span><span class="p">}</span><span class="w"> </span></code></pre></figure> |
| |
<p>To make the example available in Python, the application developer provides a Python library that
wraps the new expression and embeds it into PySpark. The easiest way to provide a function for any
expression is to take a PySpark Column instance as an argument and return a new Column instance
with the expression applied.</p>
| |
<figure class="highlight"><pre><code class="language-python" data-lang="python">from pyspark.sql.connect.column import Column, Expression
import pyspark.sql.connect.proto as proto

from myxample.proto import ExamplePluginExpression

# Internal class that satisfies the interface used by the Python client
# of Spark Connect to generate the protobuf representation from
# an instance of the expression.
class ExampleExpression(Expression):
    def to_plan(self, session) -> proto.Expression:
        fun = proto.Expression()
        plugin = ExamplePluginExpression()
        # For brevity, the child expression and the custom field are
        # hard-coded here; a full implementation would derive them
        # from the wrapped Column.
        plugin.child.literal.long = 10
        plugin.custom_field = "example"
        fun.extension.Pack(plugin)
        return fun


# Defining the function to be used by consumers.
def example_expression(col: Column) -> Column:
    return Column(ExampleExpression())


# Using the expression in the Spark Connect client code
# ('spark' is an existing Spark Connect session).
df = spark.read.table("samples.nyctaxi.trips")
df.select(example_expression(df["fare_amount"])).collect()</code></pre></figure>
| |
| |
| </div> |
| |
| <!-- /container --> |
| </div> |
| |
| <script src="js/vendor/jquery-3.5.1.min.js"></script> |
| <script src="js/vendor/bootstrap.bundle.min.js"></script> |
| |
| <script src="js/vendor/anchor.min.js"></script> |
| <script src="js/main.js"></script> |
| |
| <script type="text/javascript" src="js/vendor/docsearch.min.js"></script> |
| <script type="text/javascript"> |
            // DocSearch is entirely free and automated. DocSearch is built in two parts:
            // 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link
            //    in your website and extracts content from every page it traverses. It then pushes this
            //    content to an Algolia index.
            // 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index
            //    to your search input and display its results in a dropdown UI. If you want more
            //    details on how DocSearch works, check the DocSearch docs.
| docsearch({ |
| apiKey: 'd62f962a82bc9abb53471cb7b89da35e', |
| appId: 'RAI69RXRSK', |
| indexName: 'apache_spark', |
| inputSelector: '#docsearch-input', |
| enhancedSearchInput: true, |
| algoliaOptions: { |
| 'facetFilters': ["version:4.1.0-preview1"] |
| }, |
| debug: false // Set debug to true if you want to inspect the dropdown |
| }); |
| |
| </script> |
| |
| <!-- MathJax Section --> |
| <script type="text/x-mathjax-config"> |
| MathJax.Hub.Config({ |
| TeX: { equationNumbers: { autoNumber: "AMS" } } |
| }); |
| </script> |
| <script> |
| // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. |
| // We could use "//cdn.mathjax...", but that won't support "file://". |
| (function(d, script) { |
| script = d.createElement('script'); |
| script.type = 'text/javascript'; |
| script.async = true; |
| script.onload = function(){ |
| MathJax.Hub.Config({ |
| tex2jax: { |
| inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], |
| displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], |
| processEscapes: true, |
| skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] |
| } |
| }); |
| }; |
| script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + |
| 'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' + |
| '?config=TeX-AMS-MML_HTMLorMML'; |
| d.getElementsByTagName('head')[0].appendChild(script); |
| }(document)); |
| </script> |
| </body> |
| </html> |