blob: b4aa666d3db4e2947760ae1c8791e60c7dd08530 [file] [log] [blame]
<!doctype html><html lang=en class=no-js><head><meta charset=utf-8><meta http-equiv=x-ua-compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1"><title>Apache Spark Runner</title><meta name=description content="Apache Beam is an open source, unified model and set of language-specific SDKs for defining and executing data processing workflows, and also data ingestion and integration flows, supporting Enterprise Integration Patterns (EIPs) and Domain Specific Languages (DSLs). Dataflow pipelines simplify the mechanics of large-scale batch and streaming data processing and can run on a number of runtimes like Apache Flink, Apache Spark, and Google Cloud Dataflow (a cloud service). Beam also brings DSL in different languages, allowing users to easily implement their data integration processes."><link href="https://fonts.googleapis.com/css?family=Roboto:100,300,400,500,700" rel=stylesheet><link rel=preload href=/scss/main.min.408fddfe3e8a45f87a5a8c9a839d77db667c1c534e5e5cd0d957ffc3dd6c14cf.css as=style><link href=/scss/main.min.408fddfe3e8a45f87a5a8c9a839d77db667c1c534e5e5cd0d957ffc3dd6c14cf.css rel=stylesheet integrity><script src=https://code.jquery.com/jquery-2.2.4.min.js></script><style>.body__contained img{max-width:100%}</style><script type=text/javascript src=/js/bootstrap.min.2979f9a6e32fc42c3e7406339ee9fe76b31d1b52059776a02b4a7fa6a4fd280a.js defer></script>
<script type=text/javascript src=/js/language-switch-v2.min.121952b7980b920320ab229551857669209945e39b05ba2b433a565385ca44c6.js defer></script>
<script type=text/javascript src=/js/fix-menu.min.039174b67107465f2090a493f91e126f7aa797f29420f9edab8a54d9dd4b3d2d.js defer></script>
<script type=text/javascript src=/js/section-nav.min.1405fd5e70fab5f6c54037c269b1d137487d8f3d1b3009032525f6db3fbce991.js defer></script>
<script type=text/javascript src=/js/page-nav.min.af231204c9c52c5089d53a4c02739eacbb7f939e3be1c6ffcc212e0ac4dbf879.js defer></script>
<script type=text/javascript src=/js/expandable-list.min.75a4526624a3b8898fe7fb9e3428c205b581f8b38c7926922467aef17eac69f2.js defer></script>
<script type=text/javascript src=/js/copy-to-clipboard.min.364c06423d7e8993fc42bb4abc38c03195bc8386db26d18774ce775d08d5b18d.js defer></script>
<script type=text/javascript src=/js/calendar.min.336664054fa0f52b08bbd4e3c59b5cb6d63dcfb2b4d602839746516b0817446b.js defer></script>
<script type=text/javascript src=/js/fix-playground-nested-scroll.min.0283f1037cb1b9d5074c6eaf041292b524a8148a7cdb803d5ccd6d1fc4eb3253.js defer></script>
<script type=text/javascript src=/js/anchor-content-jump-fix.min.22d3240f81632e4c11179b9d2aaf37a40da9414333c43aa97344e8b21a7df0e4.js defer></script>
<link rel=alternate type=application/rss+xml title="Apache Beam" href=/feed.xml><link rel=canonical href=/documentation/runners/spark/ data-proofer-ignore><link rel="shortcut icon" type=image/x-icon href=/images/favicon.ico><link rel=stylesheet href=https://use.fontawesome.com/releases/v5.4.1/css/all.css integrity=sha384-5sAR7xN1Nv6T6+dT2mhtzEpVJvfS3NScPQTrOxhwjIuvcA67KV2R5Jz6kr4abQsz crossorigin=anonymous><link rel=stylesheet href=https://unpkg.com/swiper@8/swiper-bundle.min.css><script async src=https://platform.twitter.com/widgets.js></script>
<script>(function(e,t,n,s,o,i,a){e.GoogleAnalyticsObject=o,e[o]=e[o]||function(){(e[o].q=e[o].q||[]).push(arguments)},e[o].l=1*new Date,i=t.createElement(n),a=t.getElementsByTagName(n)[0],i.async=1,i.src=s,a.parentNode.insertBefore(i,a)})(window,document,"script","//www.google-analytics.com/analytics.js","ga"),ga("create","UA-73650088-1","auto"),ga("send","pageview")</script><script>(function(e,t,n,s,o,i){e.hj=e.hj||function(){(e.hj.q=e.hj.q||[]).push(arguments)},e._hjSettings={hjid:2182187,hjsv:6},o=t.getElementsByTagName("head")[0],i=t.createElement("script"),i.async=1,i.src=n+e._hjSettings.hjid+s+e._hjSettings.hjsv,o.appendChild(i)})(window,document,"https://static.hotjar.com/c/hotjar-",".js?sv=")</script></head><body class=body data-spy=scroll data-target=.page-nav data-offset=0><nav class="navigation-bar-mobile header navbar navbar-fixed-top"><div class=navbar-header><a href=/ class=navbar-brand><img alt=Brand style=height:46px;width:43px src=/images/beam_logo_navbar_mobile.png></a>
<a class=navbar-link href=/get-started/>Get Started</a>
<a class=navbar-link href=/documentation/>Documentation</a>
<button type=button class="navbar-toggle menu-open" aria-expanded=false aria-controls=navbar onclick=openMenu()>
<span class=sr-only>Toggle navigation</span>
<span class=icon-bar></span>
<span class=icon-bar></span>
<span class=icon-bar></span></button></div><div class="navbar-mask closed"></div><div id=navbar class="navbar-container closed"><button type=button class=navbar-toggle aria-expanded=false aria-controls=navbar id=closeMenu>
<span class=sr-only>Toggle navigation</span>
<span class=icon-bar></span>
<span class=icon-bar></span>
<span class=icon-bar></span></button><ul class="nav navbar-nav"><li><div class=searchBar-mobile><script>(function(){var t,n="012923275103528129024:4emlchv9wzi",e=document.createElement("script");e.type="text/javascript",e.async=!0,e.src="https://cse.google.com/cse.js?cx="+n,t=document.getElementsByTagName("script")[0],t.parentNode.insertBefore(e,t)})()</script><gcse:search></gcse:search></div></li><li><a class=navbar-link href=/about>About</a></li><li><a class=navbar-link href=/get-started/>Get Started</a></li><li><span class=navbar-link>Documentation</span><ul><li><a href=/documentation/>General</a></li><li><a href=/documentation/sdks/java/>Languages</a></li><li><a href=/documentation/runners/capability-matrix/>Runners</a></li><li><a href=/documentation/io/connectors/>I/O Connectors</a></li></ul></li><li><a class=navbar-link href=/roadmap/>Roadmap</a></li><li><a class=navbar-link href=/community/>Community</a></li><li><a class=navbar-link href=/contribute/>Contribute</a></li><li><a class=navbar-link href=/blog/>Blog</a></li><li><a class=navbar-link href=/case-studies/>Case Studies</a></li></ul><ul class="nav navbar-nav navbar-right"><li><a href=https://github.com/apache/beam/edit/master/website/www/site/content/en/documentation/runners/spark.md data-proofer-ignore><svg xmlns="http://www.w3.org/2000/svg" width="25" height="24" fill="none" viewBox="0 0 25 24"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M4.543 20h4l10.5-10.5c.53-.53.828-1.25.828-2s-.298-1.47-.828-2-1.25-.828-2-.828-1.47.298-2 .828L4.543 16v4zm9.5-13.5 4 4"/></svg></a></li><li class=dropdown><a href=# class=dropdown-toggle id=apache-dropdown data-toggle=dropdown role=button aria-haspopup=true aria-expanded=false><img src=https://www.apache.org/foundation/press/kit/feather_small.png alt="Apache Logo" style=height:20px>
&nbsp;Apache
<span class=arrow-icon><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" fill="none" viewBox="0 0 20 20"><circle cx="10" cy="10" r="10" fill="#ff6d00"/><path stroke="#fff" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M8.535 5.28l4.573 4.818-4.573 4.403"/></svg></span></a><ul class="dropdown-menu dropdown-menu-right"><li><a target=_blank href=https://www.apache.org/>ASF Homepage</a></li><li><a target=_blank href=https://www.apache.org/licenses/>License</a></li><li><a target=_blank href=https://www.apache.org/security/>Security</a></li><li><a target=_blank href=https://www.apache.org/foundation/thanks.html>Thanks</a></li><li><a target=_blank href=https://www.apache.org/foundation/sponsorship.html>Sponsorship</a></li><li><a target=_blank href=https://www.apache.org/foundation/policies/conduct>Code of Conduct</a></li></ul></li></ul></div></nav><nav class=navigation-bar-desktop><a href=/ class=navbar-logo><img src=/images/beam_logo_navbar.png alt="Beam Logo"></a><div class=navbar-bar-left><div class=navbar-links><a class=navbar-link href=/about>About</a>
<a class=navbar-link href=/get-started/>Get Started</a><li class="dropdown navbar-dropdown navbar-dropdown-documentation"><a href=# class="dropdown-toggle navbar-link" role=button aria-haspopup=true aria-expanded=false>Documentation
<span><svg xmlns="http://www.w3.org/2000/svg" width="12" height="11" fill="none" viewBox="0 0 12 11"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10.666 4.535 5.847 9.108 1.444 4.535"/></svg></span></a><ul class=dropdown-menu><li><a class=navbar-dropdown-menu-link href=/documentation/>General</a></li><li><a class=navbar-dropdown-menu-link href=/documentation/sdks/java/>Languages</a></li><li><a class=navbar-dropdown-menu-link href=/documentation/runners/capability-matrix/>Runners</a></li><li><a class=navbar-dropdown-menu-link href=/documentation/io/connectors/>I/O Connectors</a></li></ul></li><a class=navbar-link href=/roadmap/>Roadmap</a>
<a class=navbar-link href=/community/>Community</a>
<a class=navbar-link href=/contribute/>Contribute</a>
<a class=navbar-link href=/blog/>Blog</a>
<a class=navbar-link href=/case-studies/>Case Studies</a></div><div id=iconsBar><a type=button onclick=showSearch()><svg xmlns="http://www.w3.org/2000/svg" width="25" height="24" fill="none" viewBox="0 0 25 24"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M10.191 17c3.866.0 7-3.134 7-7s-3.134-7-7-7-7 3.134-7 7 3.134 7 7 7zm11 4-6-6"/></svg></a><a target=_blank href=https://github.com/apache/beam/edit/master/website/www/site/content/en/documentation/runners/spark.md data-proofer-ignore><svg xmlns="http://www.w3.org/2000/svg" width="25" height="24" fill="none" viewBox="0 0 25 24"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M4.543 20h4l10.5-10.5c.53-.53.828-1.25.828-2s-.298-1.47-.828-2-1.25-.828-2-.828-1.47.298-2 .828L4.543 16v4zm9.5-13.5 4 4"/></svg></a><li class="dropdown navbar-dropdown navbar-dropdown-apache"><a href=# class=dropdown-toggle role=button aria-haspopup=true aria-expanded=false><img src=https://www.apache.org/foundation/press/kit/feather_small.png alt="Apache Logo" style=height:20px>
&nbsp;Apache
<span class=arrow-icon><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" fill="none" viewBox="0 0 20 20"><circle cx="10" cy="10" r="10" fill="#ff6d00"/><path stroke="#fff" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M8.535 5.28l4.573 4.818-4.573 4.403"/></svg></span></a><ul class=dropdown-menu><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/>ASF Homepage</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/licenses/>License</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/security/>Security</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/foundation/thanks.html>Thanks</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/foundation/sponsorship.html>Sponsorship</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/foundation/policies/conduct>Code of Conduct</a></li></ul></li></div><div class="searchBar disappear"><script>(function(){var t,n="012923275103528129024:4emlchv9wzi",e=document.createElement("script");e.type="text/javascript",e.async=!0,e.src="https://cse.google.com/cse.js?cx="+n,t=document.getElementsByTagName("script")[0],t.parentNode.insertBefore(e,t)})()</script><gcse:search></gcse:search>
<a type=button onclick=endSearch()><svg xmlns="http://www.w3.org/2000/svg" width="25" height="25" fill="none" viewBox="0 0 25 25"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M21.122 20.827 4.727 4.432M21.122 4.43 4.727 20.827"/></svg></a></div></div></nav><div class=header-push></div><div class="top-banners swiper"><div class=swiper-wrapper><div class=swiper-slide><a href=https://tour.beam.apache.org><img class=banner-img-desktop src=/images/banners/tour-of-beam/tour-of-beam-desktop.png alt="Start Tour of Beam">
<img class=banner-img-mobile src=/images/banners/tour-of-beam/tour-of-beam-mobile.png alt="Start Tour of Beam"></a></div><div class=swiper-slide><a href=https://beam.apache.org/documentation/ml/overview/><img class=banner-img-desktop src=/images/banners/machine-learning/machine-learning-desktop.jpg alt="Machine Learning">
<img class=banner-img-mobile src=/images/banners/machine-learning/machine-learning-mobile.jpg alt="Machine Learning"></a></div></div><div class=swiper-pagination></div><div class=swiper-button-prev></div><div class=swiper-button-next></div></div><script src=/js/swiper-bundle.min.min.e0e8f81b0b15728d35ff73c07f42ddbb17a108d6f23df4953cb3e60df7ade675.js></script>
<script src=/js/sliders/top-banners.min.afa7d0a19acf7a3b28ca369490b3d401a619562a2a4c9612577be2f66a4b9855.js></script>
<script>
// Handlers for the desktop search bar and the mobile menu. They are referenced
// by name from inline onclick attributes in this page, so each one is declared
// at global scope.

// Sets the hint text on every element matched by jQuery's "input:text" selector.
function addPlaceholder() {
  $("input:text").attr("placeholder", "What are you looking for?");
}

// Toggles the "fixedPosition" class on the body element.
function blockScroll() {
  $("body").toggleClass("fixedPosition");
}

// Search icon handler: removes "disappear" from .searchBar, then adds it to #iconsBar.
function showSearch() {
  addPlaceholder();
  var searchBar = document.querySelector(".searchBar");
  searchBar.classList.remove("disappear");
  var iconsBar = document.querySelector("#iconsBar");
  iconsBar.classList.add("disappear");
}

// Close icon handler: adds "disappear" to .searchBar, then removes it from #iconsBar.
function endSearch() {
  var searchBar = document.querySelector(".searchBar");
  searchBar.classList.add("disappear");
  var iconsBar = document.querySelector("#iconsBar");
  iconsBar.classList.remove("disappear");
}

// Mobile menu-open button handler: sets the placeholder, then toggles the body class.
function openMenu() {
  addPlaceholder();
  blockScroll();
}
</script><div class="clearfix container-main-content"><div class="section-nav closed" data-offset-top=90 data-offset-bottom=500><span class="section-nav-back glyphicon glyphicon-menu-left"></span><nav><ul class=section-nav-list data-section-nav><li><span class=section-nav-list-main-title>Runners</span></li><li><a href=/documentation/runners/capability-matrix/>Capability Matrix</a></li><li><a href=/documentation/runners/direct/>Direct Runner</a></li><li><a href=/documentation/runners/flink/>Apache Flink</a></li><li><a href=/documentation/runners/nemo/>Apache Nemo</a></li><li><a href=/documentation/runners/samza/>Apache Samza</a></li><li><a href=/documentation/runners/spark/>Apache Spark</a></li><li><a href=/documentation/runners/dataflow/>Google Cloud Dataflow</a></li><li><a href=/documentation/runners/jet/>Hazelcast Jet</a></li><li><a href=/documentation/runners/twister2/>Twister2</a></li></ul></nav></div><nav class="page-nav clearfix" data-offset-top=90 data-offset-bottom=500><nav id=TableOfContents><ul><li><a href=#three-flavors-of-the-spark-runner>Three flavors of the Spark runner</a></li><li><a href=#which-runner-to-use-portable-or-non-portable-runner>Which runner to use: portable or non portable runner?</a></li><li><a href=#spark-runner-prerequisites-and-setup>Spark Runner prerequisites and setup</a><ul><li><a href=#deploying-spark-with-your-application>Deploying Spark with your application</a></li><li><a 
href=#running-on-a-pre-deployed-spark-cluster>Running on a pre-deployed Spark cluster</a></li><li><a href=#running-on-dataproc-cluster-yarn-backed>Running on Dataproc cluster (YARN backed)</a></li></ul></li><li><a href=#pipeline-options-for-the-spark-runner>Pipeline options for the Spark Runner</a></li><li><a href=#additional-notes>Additional notes</a><ul><li><a href=#using-spark-submit>Using spark-submit</a></li><li><a href=#monitoring-your-job>Monitoring your job</a></li><li><a href=#streaming-execution>Streaming Execution</a></li><li><a href=#using-a-provided-sparkcontext-and-streaminglisteners>Using a provided SparkContext and StreamingListeners</a></li><li><a href=#kubernetes>Kubernetes</a><ul><li><a href=#submit-beam-job-without-job-server>Submit beam job without job server</a></li><li><a href=#submit-beam-job-with-job-server>Submit beam job with job server</a></li></ul></li></ul></li></ul></nav></nav><div class="body__contained body__section-nav"><h1 id=using-the-apache-spark-runner>Using the Apache Spark Runner</h1><p>The Apache Spark Runner can be used to execute Beam pipelines using <a href=https://spark.apache.org/>Apache Spark</a>.
The Spark Runner can execute Beam pipelines just like a native Spark application: deploying a self-contained application for local mode, running on Spark&rsquo;s Standalone RM, or using YARN or Mesos.</p><p>The Spark Runner executes Beam pipelines on top of Apache Spark, providing:</p><ul><li>Batch and streaming (and combined) pipelines.</li><li>The same fault-tolerance <a href=https://spark.apache.org/docs/latest/streaming-programming-guide.html#fault-tolerance-semantics>guarantees</a> as provided by RDDs and DStreams.</li><li>The same <a href=https://spark.apache.org/docs/latest/security.html>security</a> features Spark provides.</li><li>Built-in metrics reporting using Spark&rsquo;s metrics system, which reports Beam Aggregators as well.</li><li>Native support for Beam side-inputs via Spark&rsquo;s Broadcast variables.</li></ul><p>The <a href=/documentation/runners/capability-matrix/>Beam Capability Matrix</a> documents the currently supported capabilities of the Spark Runner.</p><h2 id=three-flavors-of-the-spark-runner>Three flavors of the Spark runner</h2><p>The Spark runner comes in three flavors:</p><ol><li>A <em>legacy Runner</em> which supports only Java (and other JVM-based languages) and that is based on Spark RDD/DStream.</li><li>A <em>Structured Streaming Spark Runner</em> which supports only Java (and other JVM-based languages) and that is based on Spark Datasets and the <a href=https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html>Apache Spark Structured Streaming</a> framework.</li></ol><blockquote><p><strong>Note:</strong> The Structured Streaming Spark Runner is still experimental, and its coverage of the Beam model is partial. For now, it only supports batch mode.</p></blockquote><ol start=3><li>A <em>portable Runner</em> which supports Java, Python, and Go.</li></ol><p>This guide is split into two parts to document the non-portable and
the portable functionality of the Spark Runner. Please use the switcher below to
select the appropriate Runner:</p><h2 id=which-runner-to-use-portable-or-non-portable-runner>Which runner to use: portable or non portable runner?</h2><p>Beam and its Runners originally only supported JVM-based languages
(e.g. Java/Scala/Kotlin). Python and Go SDKs were added later on. The
architecture of the Runners had to be changed significantly to support executing
pipelines written in other languages.</p><p>If your applications only use Java, then you should currently go with one of the Java-based runners.
If you want to run Python or Go pipelines with Beam on Spark, you need to use
the portable Runner. For more information on portability, please visit the
<a href=/roadmap/portability/>Portability page</a>.</p><nav class=language-switcher><strong>Adapt for:</strong><ul><li data-value=java>Non portable (Java)</li><li data-value=py>Portable (Java/Python/Go)</li></ul></nav><h2 id=spark-runner-prerequisites-and-setup>Spark Runner prerequisites and setup</h2><p>The Spark runner currently supports Spark&rsquo;s 3.2.x branch.</p><blockquote><p><strong>Note:</strong> Support for Spark 2.4.x was dropped with Beam 2.46.0.</p></blockquote><p class=language-java>You can add a dependency on the latest version of the Spark runner by adding to your pom.xml the following:</p><div class='language-java snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-java data-lang=java><span class=line><span class=cl><span class=o>&lt;</span><span class=n>dependency</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>groupId</span><span class=o>&gt;</span><span class=n>org</span><span class=o>.</span><span class=na>apache</span><span class=o>.</span><span class=na>beam</span><span class=o>&lt;/</span><span class=n>groupId</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>artifactId</span><span class=o>&gt;</span><span class=n>beam</span><span class=o>-</span><span class=n>runners</span><span class=o>-</span><span class=n>spark</span><span class=o>-</span><span class=n>3</span><span class=o>&lt;/</span><span class=n>artifactId</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>version</span><span class=o>&gt;</span><span class=n>2</span><span class=o>.</span><span class=na>56</span><span class=o>.</span><span class=na>0</span><span class=o>&lt;/</span><span class=n>version</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl><span class=o>&lt;/</span><span class=n>dependency</span><span class=o>&gt;</span></span></span></code></pre></div></div></div><h3 id=deploying-spark-with-your-application>Deploying Spark with your application</h3><p class=language-java>In some cases, such as running in local mode/Standalone, your (self-contained) application would be required to pack Spark by explicitly adding the following dependencies in your pom.xml:</p><div class='language-java snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-java data-lang=java><span class=line><span class=cl><span class=o>&lt;</span><span class=n>dependency</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>groupId</span><span class=o>&gt;</span><span class=n>org</span><span class=o>.</span><span class=na>apache</span><span class=o>.</span><span class=na>spark</span><span class=o>&lt;/</span><span class=n>groupId</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>artifactId</span><span class=o>&gt;</span><span class=n>spark</span><span class=o>-</span><span class=n>core_2</span><span class=o>.</span><span class=na>12</span><span class=o>&lt;/</span><span class=n>artifactId</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>version</span><span class=o>&gt;</span><span class=n>$</span><span class=o>{</span><span class=n>spark</span><span class=o>.</span><span class=na>version</span><span class=o>}&lt;/</span><span class=n>version</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl><span class=o>&lt;/</span><span class=n>dependency</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl>
</span></span><span class=line><span class=cl><span class=o>&lt;</span><span class=n>dependency</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>groupId</span><span class=o>&gt;</span><span class=n>org</span><span class=o>.</span><span class=na>apache</span><span class=o>.</span><span class=na>spark</span><span class=o>&lt;/</span><span class=n>groupId</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>artifactId</span><span class=o>&gt;</span><span class=n>spark</span><span class=o>-</span><span class=n>streaming_2</span><span class=o>.</span><span class=na>12</span><span class=o>&lt;/</span><span class=n>artifactId</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>version</span><span class=o>&gt;</span><span class=n>$</span><span class=o>{</span><span class=n>spark</span><span class=o>.</span><span class=na>version</span><span class=o>}&lt;/</span><span class=n>version</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl><span class=o>&lt;/</span><span class=n>dependency</span><span class=o>&gt;</span></span></span></code></pre></div></div></div><p class=language-java>And shading the application jar using the maven shade plugin:</p><div class='language-java snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-java data-lang=java><span class=line><span class=cl><span class=o>&lt;</span><span class=n>plugin</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>groupId</span><span class=o>&gt;</span><span class=n>org</span><span class=o>.</span><span class=na>apache</span><span class=o>.</span><span class=na>maven</span><span class=o>.</span><span class=na>plugins</span><span class=o>&lt;/</span><span class=n>groupId</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>artifactId</span><span class=o>&gt;</span><span class=n>maven</span><span class=o>-</span><span class=n>shade</span><span class=o>-</span><span class=n>plugin</span><span class=o>&lt;/</span><span class=n>artifactId</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>configuration</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>createDependencyReducedPom</span><span class=o>&gt;</span><span class=kc>false</span><span class=o>&lt;/</span><span class=n>createDependencyReducedPom</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>filters</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>filter</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>artifact</span><span class=o>&gt;*:*&lt;/</span><span class=n>artifact</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>excludes</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>exclude</span><span class=o>&gt;</span><span class=n>META</span><span class=o>-</span><span class=n>INF</span><span class=o>/*.</span><span class=na>SF</span><span class=o>&lt;/</span><span class=n>exclude</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>exclude</span><span class=o>&gt;</span><span class=n>META</span><span class=o>-</span><span class=n>INF</span><span class=o>/*.</span><span class=na>DSA</span><span class=o>&lt;/</span><span class=n>exclude</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>exclude</span><span class=o>&gt;</span><span class=n>META</span><span class=o>-</span><span class=n>INF</span><span class=o>/*.</span><span class=na>RSA</span><span class=o>&lt;/</span><span class=n>exclude</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;/</span><span class=n>excludes</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;/</span><span class=n>filter</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;/</span><span class=n>filters</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;/</span><span class=n>configuration</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>executions</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>execution</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>phase</span><span class=o>&gt;</span><span class=n>package</span><span class=o>&lt;/</span><span class=n>phase</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>goals</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>goal</span><span class=o>&gt;</span><span class=n>shade</span><span class=o>&lt;/</span><span class=n>goal</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;/</span><span class=n>goals</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>configuration</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>shadedArtifactAttached</span><span class=o>&gt;</span><span class=kc>true</span><span class=o>&lt;/</span><span class=n>shadedArtifactAttached</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>shadedClassifierName</span><span class=o>&gt;</span><span class=n>shaded</span><span class=o>&lt;/</span><span class=n>shadedClassifierName</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>transformers</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>transformer</span>
</span></span><span class=line><span class=cl> <span class=n>implementation</span><span class=o>=</span><span class=s>&#34;org.apache.maven.plugins.shade.resource.ServicesResourceTransformer&#34;</span><span class=o>/&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;/</span><span class=n>transformers</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;/</span><span class=n>configuration</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;/</span><span class=n>execution</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;/</span><span class=n>executions</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl><span class=o>&lt;/</span><span class=n>plugin</span><span class=o>&gt;</span></span></span></code></pre></div></div></div><p class=language-java>After running <code>mvn package</code>, run <code>ls target</code> and you should see (assuming your artifactId is <code>beam-examples</code> and the version is <code>1.0.0</code>):</p><div class='language-java snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-java data-lang=java><span class=line><span class=cl><span class=n>beam</span><span class=o>-</span><span class=n>examples</span><span class=o>-</span><span class=n>1</span><span class=o>.</span><span class=na>0</span><span class=o>.</span><span class=na>0</span><span class=o>-</span><span class=n>shaded</span><span class=o>.</span><span class=na>jar</span></span></span></code></pre></div></div></div><p class=language-java>To run against a Standalone cluster simply run:</p><p class=language-java><br><b>For RDD/DStream based runner:</b><br></p><div class='language-java snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-java data-lang=java><span class=line><span class=cl><span class=n>spark</span><span class=o>-</span><span class=n>submit</span> <span class=o>--</span><span class=kd>class</span> <span class=nc>com</span><span class=o>.</span><span class=na>beam</span><span class=o>.</span><span class=na>examples</span><span class=o>.</span><span class=na>BeamPipeline</span> <span class=o>--</span><span class=n>master</span> <span class=n>spark</span><span class=o>:</span><span class=c1>//HOST:PORT target/beam-examples-1.0.0-shaded.jar --runner=SparkRunner
</span></span></span></code></pre></div></div></div><p class=language-java><br><b>For Structured Streaming based runner:</b><br></p><div class='language-java snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-java data-lang=java><span class=line><span class=cl><span class=n>spark</span><span class=o>-</span><span class=n>submit</span> <span class=o>--</span><span class=kd>class</span> <span class=nc>com</span><span class=o>.</span><span class=na>beam</span><span class=o>.</span><span class=na>examples</span><span class=o>.</span><span class=na>BeamPipeline</span> <span class=o>--</span><span class=n>master</span> <span class=n>spark</span><span class=o>:</span><span class=c1>//HOST:PORT target/beam-examples-1.0.0-shaded.jar --runner=SparkStructuredStreamingRunner
</span></span></span></code></pre></div></div></div><p class=language-py>You will need Docker to be installed in your execution environment. To develop
Apache Beam with Python you have to install the Apache Beam Python SDK: <code>pip install apache_beam</code>. Please refer to the <a href=/documentation/sdks/python/>Python documentation</a>
on how to create a Python pipeline.</p><div class='language-py snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=cl><span class=n>pip</span> <span class=n>install</span> <span class=n>apache_beam</span></span></span></code></pre></div></div></div><p class=language-py>Starting from Beam 2.20.0, pre-built Spark Job Service Docker images are available at
<a href=https://hub.docker.com/r/apache/beam_spark_job_server>Docker Hub</a>.</p><p class=language-py>For older Beam versions, you will need a copy of Apache Beam&rsquo;s source code. You can
download it on the <a href=/get-started/downloads/>Downloads page</a>.</p><p class=language-py><ol><li>Start the JobService endpoint:<ul><li>with Docker (preferred): <code>docker run --net=host apache/beam_spark_job_server:latest</code></li><li>or from Beam source code: <code>./gradlew :runners:spark:3:job-server:runShadow</code></li></ul></li></ol></p><p class=language-py>The JobService is the central instance where you submit your Beam pipeline.
The JobService will create a Spark job for the pipeline and execute the
job. To execute the job on a Spark cluster, the Beam JobService needs to be
provided with the Spark master address.</p><p class=language-py><ol start=2><li>Submit the Python pipeline to the above endpoint by using the <code>PortableRunner</code>, <code>job_endpoint</code> set to <code>localhost:8099</code> (this is the default address of the JobService), and <code>environment_type</code> set to <code>LOOPBACK</code>. For example:</li></ol></p><div class='language-py snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=cl><span class=kn>import</span> <span class=nn>apache_beam</span> <span class=k>as</span> <span class=nn>beam</span>
</span></span><span class=line><span class=cl><span class=kn>from</span> <span class=nn>apache_beam.options.pipeline_options</span> <span class=kn>import</span> <span class=n>PipelineOptions</span>
</span></span><span class=line><span class=cl>
</span></span><span class=line><span class=cl><span class=n>options</span> <span class=o>=</span> <span class=n>PipelineOptions</span><span class=p>([</span>
</span></span><span class=line><span class=cl> <span class=s2>&#34;--runner=PortableRunner&#34;</span><span class=p>,</span>
</span></span><span class=line><span class=cl> <span class=s2>&#34;--job_endpoint=localhost:8099&#34;</span><span class=p>,</span>
</span></span><span class=line><span class=cl> <span class=s2>&#34;--environment_type=LOOPBACK&#34;</span>
</span></span><span class=line><span class=cl><span class=p>])</span>
</span></span><span class=line><span class=cl><span class=k>with</span> <span class=n>beam</span><span class=o>.</span><span class=n>Pipeline</span><span class=p>(</span><span class=n>options</span><span class=p>)</span> <span class=k>as</span> <span class=n>p</span><span class=p>:</span>
</span></span><span class=line><span class=cl> <span class=o>...</span></span></span></code></pre></div></div></div><h3 id=running-on-a-pre-deployed-spark-cluster>Running on a pre-deployed Spark cluster</h3><p>Deploying your Beam pipeline on a cluster that already has a Spark deployment (Spark classes are available in container classpath) does not require any additional dependencies.
For more details on the different deployment modes see: <a href=https://spark.apache.org/docs/latest/spark-standalone.html>Standalone</a>, <a href=https://spark.apache.org/docs/latest/running-on-yarn.html>YARN</a>, or <a href=https://spark.apache.org/docs/latest/running-on-mesos.html>Mesos</a>.</p><p class=language-py><ol><li>Start a Spark cluster which exposes the master on port 7077 by default.</li></ol></p><p class=language-py><ol start=2><li>Start JobService that will connect with the Spark master:<ul><li>with Docker (preferred): <code>docker run --net=host apache/beam_spark_job_server:latest --spark-master-url=spark://localhost:7077</code></li><li>or from Beam source code: <code>./gradlew :runners:spark:3:job-server:runShadow -PsparkMasterUrl=spark://localhost:7077</code></li></ul></li></ol></p><p class=language-py><ol start=3><li>Submit the pipeline as above.
Note however that <code>environment_type=LOOPBACK</code> is only intended for local testing.
See <a href=/roadmap/portability/#sdk-harness-config>here</a> for details.</li></ol></p><p class=language-py>(Note that, depending on your cluster setup, you may need to change the <code>environment_type</code> option.
See <a href=/roadmap/portability/#sdk-harness-config>here</a> for details.)</p><h3 id=running-on-dataproc-cluster-yarn-backed>Running on Dataproc cluster (YARN backed)</h3><p>To run Beam jobs written in Python, Go, and other supported languages, you can use the <code>SparkRunner</code> and <code>PortableRunner</code> as described on Beam&rsquo;s <a href=https://beam.apache.org/documentation/runners/spark/>Spark Runner</a> page (also see <a href=https://beam.apache.org/roadmap/portability/>Portability Framework Roadmap</a>).</p><p>The following example runs a portable Beam job in Python from the master node of a YARN-backed Dataproc cluster.</p><blockquote><p>Note: This example executes successfully with Dataproc 2.0, Spark 3.1.2 and Beam 2.37.0.</p></blockquote><ol><li>Create a Dataproc cluster with the <a href=https://cloud.google.com/dataproc/docs/concepts/components/docker>Docker</a> component enabled.</li></ol><pre>
gcloud dataproc clusters create <b><i>CLUSTER_NAME</i></b> \
--optional-components=DOCKER \
--image-version=<b><i>DATAPROC_IMAGE_VERSION</i></b> \
--region=<b><i>REGION</i></b> \
--enable-component-gateway \
--scopes=https://www.googleapis.com/auth/cloud-platform \
--properties spark:spark.master.rest.enabled=true
</pre><ul><li><code>--optional-components</code>: Docker.</li><li><code>--image-version</code>: the <a href=https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-versions#supported_cloud_dataproc_versions>cluster&rsquo;s image version</a>, which determines the Spark version installed on the cluster (for example, see the Apache Spark component versions listed for the latest and previous four <a href=https://cloud.google.com/dataproc/docs/concepts/versioning/dataproc-release-2.0>2.0.x image release versions</a>).</li><li><code>--region</code>: a supported Dataproc <a href=https://cloud.google.com/dataproc/docs/concepts/regional-endpoints#regional_endpoint_semantics>region</a>.</li><li><code>--enable-component-gateway</code>: enable access to <a href=https://cloud.google.com/dataproc/docs/concepts/accessing/dataproc-gateways>web interfaces</a>.</li><li><code>--scopes</code>: enable API access to GCP services in the same project.</li><li><code>--properties</code>: add specific configuration for some component, here spark.master.rest is enabled to use job submit to the cluster.</li></ul><ol start=2><li>Create a Cloud Storage bucket.</li></ol><pre>
gsutil mb <b><i>BUCKET_NAME</i></b>
</pre><ol start=3><li>Install the necessary Python libraries for the job in your local environment.</li></ol><pre>
python -m pip install apache-beam[gcp]==<b><i>BEAM_VERSION</i></b>
</pre><ol start=4><li>Bundle the word count example pipeline along with all dependencies, artifacts, etc. required to run the pipeline into a jar that can be executed later.</li></ol><pre>
python -m apache_beam.examples.wordcount \
--runner=SparkRunner \
--output_executable_path=<b><i>OUTPUT_JAR_PATH</i></b> \
--output=gs://<b><i>BUCKET_NAME</i></b>/python-wordcount-out \
--spark_version=3
</pre><ul><li><code>--runner</code>(required): <code>SparkRunner</code>.</li><li><code>--output_executable_path</code>(required): path for the bundle jar to be created.</li><li><code>--output</code>(required): where output shall be written.</li><li><code>--spark_version</code>(optional): select spark version 3 (default) or 2 (deprecated!).</li></ul><ol start=5><li>Submit spark job to Dataproc cluster&rsquo;s master node.</li></ol><pre>
gcloud dataproc jobs submit spark \
--cluster=<b><i>CLUSTER_NAME</i></b> \
--region=<b><i>REGION</i></b> \
--class=org.apache.beam.runners.spark.SparkPipelineRunner \
--jars=<b><i>OUTPUT_JAR_PATH</i></b>
</pre><ul><li><code>--cluster</code>: name of created Dataproc cluster.</li><li><code>--region</code>: a supported Dataproc <a href=https://cloud.google.com/dataproc/docs/concepts/regional-endpoints#regional_endpoint_semantics>region</a>.</li><li><code>--class</code>: the entry point for your application.</li><li><code>--jars</code>: path to the bundled jar including your application and all dependencies.</li></ul><ol start=6><li>Check that the results were written to your bucket.</li></ol><pre>
gsutil cat gs://<b><i>BUCKET_NAME</i></b>/python-wordcount-out-<b><i>SHARD_ID</i></b>
</pre><h2 id=pipeline-options-for-the-spark-runner>Pipeline options for the Spark Runner</h2><p>When executing your pipeline with the Spark Runner, you should consider the following pipeline options.</p><p class=language-java><br><b>For RDD/DStream based runner:</b><br></p><div class=table-container-wrapper><table class="language-java table table-bordered"><tr><th>Field</th><th>Description</th><th>Default Value</th></tr><tr><td><code>runner</code></td><td>The pipeline runner to use. This option allows you to determine the pipeline runner at runtime.</td><td>Set to <code>SparkRunner</code> to run using Spark.</td></tr><tr><td><code>sparkMaster</code></td><td>The url of the Spark Master. This is the equivalent of setting <code>SparkConf#setMaster(String)</code> and can either be <code>local[x]</code> to run local with x cores, <code>spark://host:port</code> to connect to a Spark Standalone cluster, <code>mesos://host:port</code> to connect to a Mesos cluster, or <code>yarn</code> to connect to a yarn cluster.</td><td><code>local[4]</code></td></tr><tr><td><code>storageLevel</code></td><td>The <code>StorageLevel</code> to use when caching RDDs in batch pipelines. The Spark Runner automatically caches RDDs that are evaluated repeatedly. This is a batch-only property as streaming pipelines in Beam are stateful, which requires Spark DStream's <code>StorageLevel</code> to be <code>MEMORY_ONLY</code>.</td><td>MEMORY_ONLY</td></tr><tr><td><code>batchIntervalMillis</code></td><td>The <code>StreamingContext</code>'s <code>batchDuration</code> - setting Spark's batch interval.</td><td><code>1000</code></td></tr><tr><td><code>enableSparkMetricSinks</code></td><td>Enable reporting metrics to Spark's metrics Sinks.</td><td>true</td></tr><tr><td><code>cacheDisabled</code></td><td>Disable caching of reused PCollections for whole Pipeline. 
It's useful when it's faster to recompute RDD rather than save.</td><td>false</td></tr></table></div><p class=language-java><br><b>For Structured Streaming based runner:</b><br></p><div class=table-container-wrapper><table class="language-java table table-bordered"><tr><th>Field</th><th>Description</th><th>Default Value</th></tr><tr><td><code>runner</code></td><td>The pipeline runner to use. This option allows you to determine the pipeline runner at runtime.</td><td>Set to <code>SparkStructuredStreamingRunner</code> to run using Spark Structured Streaming.</td></tr><tr><td><code>sparkMaster</code></td><td>The url of the Spark Master. This is the equivalent of setting <code>SparkConf#setMaster(String)</code> and can either be <code>local[x]</code> to run local with x cores, <code>spark://host:port</code> to connect to a Spark Standalone cluster, <code>mesos://host:port</code> to connect to a Mesos cluster, or <code>yarn</code> to connect to a yarn cluster.</td><td><code>local[4]</code></td></tr><tr><td><code>testMode</code></td><td>Enable test mode that gives useful debugging information: catalyst execution plans and Beam DAG printing</td><td>false</td></tr><tr><td><code>enableSparkMetricSinks</code></td><td>Enable reporting metrics to Spark's metrics Sinks.</td><td>true</td></tr><tr><td><code>checkpointDir</code></td><td>A checkpoint directory for streaming resilience, ignored in batch. 
For durability, a reliable filesystem such as HDFS/S3/GS is necessary.</td><td>local dir in /tmp</td></tr><tr><td><code>filesToStage</code></td><td>Jar-Files to send to all workers and put on the classpath.</td><td>all files from the classpath</td></tr><tr><td><code>EnableSparkMetricSinks</code></td><td>Enable/disable sending aggregator values to Spark's metric sinks</td><td>true</td></tr></table></div><div class=table-container-wrapper><table class="language-py table table-bordered"><tr><th>Field</th><th>Description</th><th>Value</th></tr><tr><td><code>--runner</code></td><td>The pipeline runner to use. This option allows you to determine the pipeline runner at runtime.</td><td>Set to <code>PortableRunner</code> to run using Spark.</td></tr><tr><td><code>--job_endpoint</code></td><td>Job service endpoint to use. Should be in the form hostname:port, e.g. localhost:3000</td><td>Set to match your job service endpoint (localhost:8099 by default)</td></tr></table></div><h2 id=additional-notes>Additional notes</h2><h3 id=using-spark-submit>Using spark-submit</h3><p>When submitting a Spark application to cluster, it is common (and recommended) to use the <code>spark-submit</code> script that is provided with the spark installation.
The <code>PipelineOptions</code> described above are not to replace <code>spark-submit</code>, but to complement it.
Any of the above-mentioned options can be passed as one of the <code>application-arguments</code>, and setting <code>--master</code> takes precedence.
For more on how to generally use <code>spark-submit</code>, check out the Spark <a href=https://spark.apache.org/docs/latest/submitting-applications.html#launching-applications-with-spark-submit>documentation</a>.</p><h3 id=monitoring-your-job>Monitoring your job</h3><p>You can monitor a running Spark job using the Spark <a href=https://spark.apache.org/docs/latest/monitoring.html#web-interfaces>Web Interfaces</a>. By default, this is available at port <code>4040</code> on the driver node. If you run Spark on your local machine that would be <code>http://localhost:4040</code>.
Spark also has a history server to <a href=https://spark.apache.org/docs/latest/monitoring.html#viewing-after-the-fact>view after the fact</a>.</p><p class=language-java>Metrics are also available via <a href=https://spark.apache.org/docs/latest/monitoring.html#rest-api>REST API</a>.
Spark provides a <a href=https://spark.apache.org/docs/latest/monitoring.html#metrics>metrics system</a> that allows reporting Spark metrics to a variety of Sinks.
The Spark runner reports user-defined Beam Aggregators using this same metrics system and currently supports
<a href=https://beam.apache.org/releases/javadoc/2.56.0/org/apache/beam/runners/spark/metrics/sink/GraphiteSink.html>GraphiteSink</a>
and <a href=https://beam.apache.org/releases/javadoc/2.56.0/org/apache/beam/runners/spark/metrics/sink/CsvSink.html>CSVSink</a>.
Providing support for additional Sinks supported by Spark is easy and straight-forward.</p><p class=language-py>Spark metrics are not yet supported on the portable runner.</p><h3 id=streaming-execution>Streaming Execution</h3><p class=language-java><br><b>For RDD/DStream based runner:</b><br>If your pipeline uses an <code>UnboundedSource</code> the Spark Runner will automatically set streaming mode. Forcing streaming mode is mostly used for testing and is not recommended.<br><br><b>For Structured Streaming based runner:</b><br>Streaming mode is not implemented yet in the Spark Structured Streaming runner.</p><p class=language-py>Streaming is not yet supported on the Spark portable runner.</p><h3 id=using-a-provided-sparkcontext-and-streaminglisteners>Using a provided SparkContext and StreamingListeners</h3><p class=language-java><br><b>For RDD/DStream based runner:</b><br>If you would like to execute your Spark job with a provided <code>SparkContext</code>, such as when using the <a href=https://github.com/spark-jobserver/spark-jobserver>spark-jobserver</a>, or use <code>StreamingListeners</code>, you can&rsquo;t use <code>SparkPipelineOptions</code> (the context or a listener cannot be passed as a command-line argument anyway).
Instead, you should use <code>SparkContextOptions</code> which can only be used programmatically and is not a common <code>PipelineOptions</code> implementation.<br><br><b>For Structured Streaming based runner:</b><br>Provided SparkSession and StreamingListeners are not supported on the Spark Structured Streaming runner.</p><p class=language-py>Provided SparkContext and StreamingListeners are not supported on the Spark portable runner.</p><h3 id=kubernetes>Kubernetes</h3><h4 id=submit-beam-job-without-job-server>Submit beam job without job server</h4><p>To submit a Beam job directly on a Spark Kubernetes cluster without spinning up an extra job server, you can run:</p><pre tabindex=0><code>spark-submit --master MASTER_URL \
--conf spark.kubernetes.driver.podTemplateFile=driver_pod_template.yaml \
--conf spark.kubernetes.executor.podTemplateFile=executor_pod_template.yaml \
--class org.apache.beam.runners.spark.SparkPipelineRunner \
--conf spark.kubernetes.container.image=apache/spark:v3.3.2 \
./wc_job.jar
</code></pre><p>Similar to running the Beam job on Dataproc, you can bundle the job jar as shown below. The example uses the <code>PROCESS</code> type of <a href=https://beam.apache.org/documentation/runtime/sdk-harness-config/>SDK harness</a> to execute the job by processes.</p><pre tabindex=0><code>python -m beam_example_wc \
--runner=SparkRunner \
--output_executable_path=./wc_job.jar \
--environment_type=PROCESS \
--environment_config=&#39;{\&#34;command\&#34;: \&#34;/opt/apache/beam/boot\&#34;}&#39; \
--spark_version=3
</code></pre><p>Below is an example of a Kubernetes executor pod template. The <code>initContainer</code> is required to download the Beam SDK harness to run the Beam pipelines.</p><pre tabindex=0><code>spec:
containers:
- name: spark-kubernetes-executor
volumeMounts:
- name: beam-data
mountPath: /opt/apache/beam/
initContainers:
- name: init-beam
image: apache/beam_python3.7_sdk
command:
- cp
- /opt/apache/beam/boot
- /init-container/data/boot
volumeMounts:
- name: beam-data
mountPath: /init-container/data
volumes:
- name: beam-data
emptyDir: {}
</code></pre><h4 id=submit-beam-job-with-job-server>Submit beam job with job server</h4><p>An <a href=https://github.com/cometta/python-apache-beam-spark>example</a> of configuring Spark to run Apache beam job with a job server.</p><div class=feedback><p class=update>Last updated on 2024/05/07</p><h3>Have you found everything you were looking for?</h3><p class=description>Was it all useful and clear? Is there anything that you would like to change? Let us know!</p><button class=load-button><a href="https://docs.google.com/forms/d/e/1FAIpQLSfID7abne3GE6k6RdJIyZhPz2Gef7UkpggUEhTIDjjplHuxSA/viewform?usp=header_link" target=_blank>SEND FEEDBACK</a></button></div></div></div><footer class=footer><div class=footer__contained><div class=footer__cols><div class="footer__cols__col footer__cols__col__logos"><div class=footer__cols__col__logo><img src=/images/beam_logo_circle.svg class=footer__logo alt="Beam logo"></div><div class=footer__cols__col__logo><img src=/images/apache_logo_circle.svg class=footer__logo alt="Apache logo"></div></div><div class=footer-wrapper><div class=wrapper-grid><div class=footer__cols__col><div class=footer__cols__col__title>Start</div><div class=footer__cols__col__link><a href=/get-started/beam-overview/>Overview</a></div><div class=footer__cols__col__link><a href=/get-started/quickstart-java/>Quickstart (Java)</a></div><div class=footer__cols__col__link><a href=/get-started/quickstart-py/>Quickstart (Python)</a></div><div class=footer__cols__col__link><a href=/get-started/quickstart-go/>Quickstart (Go)</a></div><div class=footer__cols__col__link><a href=/get-started/downloads/>Downloads</a></div></div><div class=footer__cols__col><div class=footer__cols__col__title>Docs</div><div class=footer__cols__col__link><a href=/documentation/programming-guide/>Concepts</a></div><div class=footer__cols__col__link><a href=/documentation/pipelines/design-your-pipeline/>Pipelines</a></div><div class=footer__cols__col__link><a 
href=/documentation/runners/capability-matrix/>Runners</a></div></div><div class=footer__cols__col><div class=footer__cols__col__title>Community</div><div class=footer__cols__col__link><a href=/contribute/>Contribute</a></div><div class=footer__cols__col__link><a href=https://projects.apache.org/committee.html?beam target=_blank>Team<img src=/images/external-link-icon.png width=14 height=14 alt="External link."></a></div><div class=footer__cols__col__link><a href=/community/presentation-materials/>Media</a></div><div class=footer__cols__col__link><a href=/community/in-person/>Events/Meetups</a></div><div class=footer__cols__col__link><a href=/community/contact-us/>Contact Us</a></div></div><div class=footer__cols__col><div class=footer__cols__col__title>Resources</div><div class=footer__cols__col__link><a href=/blog/>Blog</a></div><div class=footer__cols__col__link><a href=https://github.com/apache/beam>GitHub</a></div></div></div><div class=footer__bottom>&copy;
<a href=https://www.apache.org>The Apache Software Foundation</a>
| <a href=/privacy_policy>Privacy Policy</a>
| <a href=/feed.xml>RSS Feed</a><br><br>Apache Beam, Apache, Beam, the Beam logo, and the Apache feather logo are either registered trademarks or trademarks of The Apache Software Foundation. All other products or name brands are trademarks of their respective holders, including The Apache Software Foundation.</div></div><div class="footer__cols__col footer__cols__col__logos"><div class=footer__cols__col--group><div class=footer__cols__col__logo><a href=https://github.com/apache/beam><img src=/images/logos/social-icons/github-logo-150.png class=footer__logo alt="Github logo"></a></div><div class=footer__cols__col__logo><a href=https://www.linkedin.com/company/apache-beam/><img src=/images/logos/social-icons/linkedin-logo-150.png class=footer__logo alt="Linkedin logo"></a></div></div><div class=footer__cols__col--group><div class=footer__cols__col__logo><a href=https://twitter.com/apachebeam><img src=/images/logos/social-icons/twitter-logo-150.png class=footer__logo alt="Twitter logo"></a></div><div class=footer__cols__col__logo><a href=https://www.youtube.com/channel/UChNnb_YO_7B0HlW6FhAXZZQ><img src=/images/logos/social-icons/youtube-logo-150.png class=footer__logo alt="Youtube logo"></a></div></div></div></div></div></footer></body></html>