blob: 5f5eb3b932e68d80bb823587b2727a6cdbcded3f [file] [log] [blame]
<!doctype html><html lang=en class=no-js><head><meta charset=utf-8><meta http-equiv=x-ua-compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1"><title>Cloud Dataflow Runner</title><meta name=description content="Apache Beam is an open source, unified model and set of language-specific SDKs for defining and executing data processing workflows, and also data ingestion and integration flows, supporting Enterprise Integration Patterns (EIPs) and Domain Specific Languages (DSLs). Dataflow pipelines simplify the mechanics of large-scale batch and streaming data processing and can run on a number of runtimes like Apache Flink, Apache Spark, and Google Cloud Dataflow (a cloud service). Beam also brings DSL in different languages, allowing users to easily implement their data integration processes."><link href="https://fonts.googleapis.com/css?family=Roboto:100,300,400,500,700" rel=stylesheet><link rel=preload href=/scss/main.min.d653ded46cd5f19a535cb20567fce9699849fe46f950d91ac6bf336db8ff8724.css as=style><link href=/scss/main.min.d653ded46cd5f19a535cb20567fce9699849fe46f950d91ac6bf336db8ff8724.css rel=stylesheet integrity><script src=https://code.jquery.com/jquery-2.2.4.min.js></script><style>.body__contained img{max-width:100%}</style><script type=text/javascript src=/js/bootstrap.min.2979f9a6e32fc42c3e7406339ee9fe76b31d1b52059776a02b4a7fa6a4fd280a.js defer></script>
<script type=text/javascript src=/js/language-switch-v2.min.121952b7980b920320ab229551857669209945e39b05ba2b433a565385ca44c6.js defer></script>
<script type=text/javascript src=/js/fix-menu.min.039174b67107465f2090a493f91e126f7aa797f29420f9edab8a54d9dd4b3d2d.js defer></script>
<script type=text/javascript src=/js/section-nav.min.1405fd5e70fab5f6c54037c269b1d137487d8f3d1b3009032525f6db3fbce991.js defer></script>
<script type=text/javascript src=/js/page-nav.min.af231204c9c52c5089d53a4c02739eacbb7f939e3be1c6ffcc212e0ac4dbf879.js defer></script>
<script type=text/javascript src=/js/expandable-list.min.75a4526624a3b8898fe7fb9e3428c205b581f8b38c7926922467aef17eac69f2.js defer></script>
<script type=text/javascript src=/js/copy-to-clipboard.min.364c06423d7e8993fc42bb4abc38c03195bc8386db26d18774ce775d08d5b18d.js defer></script>
<script type=text/javascript src=/js/calendar.min.336664054fa0f52b08bbd4e3c59b5cb6d63dcfb2b4d602839746516b0817446b.js defer></script>
<script type=text/javascript src=/js/fix-playground-nested-scroll.min.0283f1037cb1b9d5074c6eaf041292b524a8148a7cdb803d5ccd6d1fc4eb3253.js defer></script>
<script type=text/javascript src=/js/anchor-content-jump-fix.min.22d3240f81632e4c11179b9d2aaf37a40da9414333c43aa97344e8b21a7df0e4.js defer></script>
<link rel=alternate type=application/rss+xml title="Apache Beam" href=/feed.xml><link rel=canonical href=/documentation/runners/dataflow/ data-proofer-ignore><link rel="shortcut icon" type=image/x-icon href=/images/favicon.ico><link rel=stylesheet href=https://use.fontawesome.com/releases/v5.4.1/css/all.css integrity=sha384-5sAR7xN1Nv6T6+dT2mhtzEpVJvfS3NScPQTrOxhwjIuvcA67KV2R5Jz6kr4abQsz crossorigin=anonymous><link rel=stylesheet href=https://unpkg.com/swiper@8/swiper-bundle.min.css><script async src=https://platform.twitter.com/widgets.js></script>
<script>(function(e,t,n,s,o,i,a){e.GoogleAnalyticsObject=o,e[o]=e[o]||function(){(e[o].q=e[o].q||[]).push(arguments)},e[o].l=1*new Date,i=t.createElement(n),a=t.getElementsByTagName(n)[0],i.async=1,i.src=s,a.parentNode.insertBefore(i,a)})(window,document,"script","//www.google-analytics.com/analytics.js","ga"),ga("create","UA-73650088-1","auto"),ga("send","pageview")</script><script>(function(e,t,n,s,o,i){e.hj=e.hj||function(){(e.hj.q=e.hj.q||[]).push(arguments)},e._hjSettings={hjid:2182187,hjsv:6},o=t.getElementsByTagName("head")[0],i=t.createElement("script"),i.async=1,i.src=n+e._hjSettings.hjid+s+e._hjSettings.hjsv,o.appendChild(i)})(window,document,"https://static.hotjar.com/c/hotjar-",".js?sv=")</script></head><body class=body data-spy=scroll data-target=.page-nav data-offset=0><nav class="navigation-bar-mobile header navbar navbar-fixed-top"><div class=navbar-header><a href=/ class=navbar-brand><img alt=Brand style=height:46px;width:43px src=/images/beam_logo_navbar_mobile.png></a>
<a class=navbar-link href=/get-started/>Get Started</a>
<a class=navbar-link href=/documentation/>Documentation</a>
<button type=button class="navbar-toggle menu-open" aria-expanded=false aria-controls=navbar onclick=openMenu()>
<span class=sr-only>Toggle navigation</span>
<span class=icon-bar></span>
<span class=icon-bar></span>
<span class=icon-bar></span></button></div><div class="navbar-mask closed"></div><div id=navbar class="navbar-container closed"><button type=button class=navbar-toggle aria-expanded=false aria-controls=navbar id=closeMenu>
<span class=sr-only>Toggle navigation</span>
<span class=icon-bar></span>
<span class=icon-bar></span>
<span class=icon-bar></span></button><ul class="nav navbar-nav"><li><div class=searchBar-mobile><script>(function(){var t,n="012923275103528129024:4emlchv9wzi",e=document.createElement("script");e.type="text/javascript",e.async=!0,e.src="https://cse.google.com/cse.js?cx="+n,t=document.getElementsByTagName("script")[0],t.parentNode.insertBefore(e,t)})()</script><gcse:search></gcse:search></div></li><li><a class=navbar-link href=/about>About</a></li><li><a class=navbar-link href=/get-started/>Get Started</a></li><li><span class=navbar-link>Documentation</span><ul><li><a href=/documentation/>General</a></li><li><a href=/documentation/sdks/java/>Languages</a></li><li><a href=/documentation/runners/capability-matrix/>Runners</a></li><li><a href=/documentation/io/connectors/>I/O Connectors</a></li></ul></li><li><a class=navbar-link href=/roadmap/>Roadmap</a></li><li><a class=navbar-link href=/community/>Community</a></li><li><a class=navbar-link href=/contribute/>Contribute</a></li><li><a class=navbar-link href=/blog/>Blog</a></li><li><a class=navbar-link href=/case-studies/>Case Studies</a></li></ul><ul class="nav navbar-nav navbar-right"><li><a href=https://github.com/apache/beam/edit/master/website/www/site/content/en/documentation/runners/dataflow.md data-proofer-ignore><svg xmlns="http://www.w3.org/2000/svg" width="25" height="24" fill="none" viewBox="0 0 25 24"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M4.543 20h4l10.5-10.5c.53-.53.828-1.25.828-2s-.298-1.47-.828-2-1.25-.828-2-.828-1.47.298-2 .828L4.543 16v4zm9.5-13.5 4 4"/></svg></a></li><li class=dropdown><a href=# class=dropdown-toggle id=apache-dropdown data-toggle=dropdown role=button aria-haspopup=true aria-expanded=false><img src=https://www.apache.org/foundation/press/kit/feather_small.png alt="Apache Logo" style=height:20px>
&nbsp;Apache
<span class=arrow-icon><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" fill="none" viewBox="0 0 20 20"><circle cx="10" cy="10" r="10" fill="#ff6d00"/><path stroke="#fff" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M8.535 5.28l4.573 4.818-4.573 4.403"/></svg></span></a><ul class="dropdown-menu dropdown-menu-right"><li><a target=_blank href=https://www.apache.org/>ASF Homepage</a></li><li><a target=_blank href=https://www.apache.org/licenses/>License</a></li><li><a target=_blank href=https://www.apache.org/security/>Security</a></li><li><a target=_blank href=https://www.apache.org/foundation/thanks.html>Thanks</a></li><li><a target=_blank href=https://www.apache.org/foundation/sponsorship.html>Sponsorship</a></li><li><a target=_blank href=https://www.apache.org/foundation/policies/conduct>Code of Conduct</a></li></ul></li></ul></div></nav><nav class=navigation-bar-desktop><a href=/ class=navbar-logo><img src=/images/beam_logo_navbar.png alt="Beam Logo"></a><div class=navbar-bar-left><div class=navbar-links><a class=navbar-link href=/about>About</a>
<a class=navbar-link href=/get-started/>Get Started</a><li class="dropdown navbar-dropdown navbar-dropdown-documentation"><a href=# class="dropdown-toggle navbar-link" role=button aria-haspopup=true aria-expanded=false>Documentation
<span><svg xmlns="http://www.w3.org/2000/svg" width="12" height="11" fill="none" viewBox="0 0 12 11"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10.666 4.535 5.847 9.108 1.444 4.535"/></svg></span></a><ul class=dropdown-menu><li><a class=navbar-dropdown-menu-link href=/documentation/>General</a></li><li><a class=navbar-dropdown-menu-link href=/documentation/sdks/java/>Languages</a></li><li><a class=navbar-dropdown-menu-link href=/documentation/runners/capability-matrix/>Runners</a></li><li><a class=navbar-dropdown-menu-link href=/documentation/io/connectors/>I/O Connectors</a></li></ul></li><a class=navbar-link href=/roadmap/>Roadmap</a>
<a class=navbar-link href=/community/>Community</a>
<a class=navbar-link href=/contribute/>Contribute</a>
<a class=navbar-link href=/blog/>Blog</a>
<a class=navbar-link href=/case-studies/>Case Studies</a></div><div id=iconsBar><a type=button onclick=showSearch()><svg xmlns="http://www.w3.org/2000/svg" width="25" height="24" fill="none" viewBox="0 0 25 24"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M10.191 17c3.866.0 7-3.134 7-7s-3.134-7-7-7-7 3.134-7 7 3.134 7 7 7zm11 4-6-6"/></svg></a><a target=_blank href=https://github.com/apache/beam/edit/master/website/www/site/content/en/documentation/runners/dataflow.md data-proofer-ignore><svg xmlns="http://www.w3.org/2000/svg" width="25" height="24" fill="none" viewBox="0 0 25 24"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M4.543 20h4l10.5-10.5c.53-.53.828-1.25.828-2s-.298-1.47-.828-2-1.25-.828-2-.828-1.47.298-2 .828L4.543 16v4zm9.5-13.5 4 4"/></svg></a><li class="dropdown navbar-dropdown navbar-dropdown-apache"><a href=# class=dropdown-toggle role=button aria-haspopup=true aria-expanded=false><img src=https://www.apache.org/foundation/press/kit/feather_small.png alt="Apache Logo" style=height:20px>
&nbsp;Apache
<span class=arrow-icon><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" fill="none" viewBox="0 0 20 20"><circle cx="10" cy="10" r="10" fill="#ff6d00"/><path stroke="#fff" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M8.535 5.28l4.573 4.818-4.573 4.403"/></svg></span></a><ul class=dropdown-menu><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/>ASF Homepage</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/licenses/>License</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/security/>Security</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/foundation/thanks.html>Thanks</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/foundation/sponsorship.html>Sponsorship</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/foundation/policies/conduct>Code of Conduct</a></li></ul></li></div><div class="searchBar disappear"><script>(function(){var t,n="012923275103528129024:4emlchv9wzi",e=document.createElement("script");e.type="text/javascript",e.async=!0,e.src="https://cse.google.com/cse.js?cx="+n,t=document.getElementsByTagName("script")[0],t.parentNode.insertBefore(e,t)})()</script><gcse:search></gcse:search>
<a type=button onclick=endSearch()><svg xmlns="http://www.w3.org/2000/svg" width="25" height="25" fill="none" viewBox="0 0 25 25"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M21.122 20.827 4.727 4.432M21.122 4.43 4.727 20.827"/></svg></a></div></div></nav><div class=header-push></div><div class="top-banners swiper"><div class=swiper-wrapper><div class=swiper-slide><a href=https://tour.beam.apache.org><img class=banner-img-desktop src=/images/banners/tour-of-beam/tour-of-beam-desktop.png alt="Start Tour of Beam">
<img class=banner-img-mobile src=/images/banners/tour-of-beam/tour-of-beam-mobile.png alt="Start Tour of Beam"></a></div><div class=swiper-slide><a href=https://beam.apache.org/documentation/ml/overview/><img class=banner-img-desktop src=/images/banners/machine-learning/machine-learning-desktop.jpg alt="Machine Learning">
<img class=banner-img-mobile src=/images/banners/machine-learning/machine-learning-mobile.jpg alt="Machine Learning"></a></div></div><div class=swiper-pagination></div><div class=swiper-button-prev></div><div class=swiper-button-next></div></div><script src=/js/swiper-bundle.min.min.e0e8f81b0b15728d35ff73c07f42ddbb17a108d6f23df4953cb3e60df7ade675.js></script>
<script src=/js/sliders/top-banners.min.afa7d0a19acf7a3b28ca369490b3d401a619562a2a4c9612577be2f66a4b9855.js></script>
<script>function showSearch(){addPlaceholder();var e,t=document.querySelector(".searchBar");t.classList.remove("disappear"),e=document.querySelector("#iconsBar"),e.classList.add("disappear")}function addPlaceholder(){$("input:text").attr("placeholder","What are you looking for?")}function endSearch(){var e,t=document.querySelector(".searchBar");t.classList.add("disappear"),e=document.querySelector("#iconsBar"),e.classList.remove("disappear")}function blockScroll(){$("body").toggleClass("fixedPosition")}function openMenu(){addPlaceholder(),blockScroll()}</script><div class="clearfix container-main-content"><div class="section-nav closed" data-offset-top=90 data-offset-bottom=500><span class="section-nav-back glyphicon glyphicon-menu-left"></span><nav><ul class=section-nav-list data-section-nav><li><span class=section-nav-list-main-title>Runners</span></li><li><a href=/documentation/runners/capability-matrix/>Capability Matrix</a></li><li><a href=/documentation/runners/direct/>Direct Runner</a></li><li><a href=/documentation/runners/flink/>Apache Flink</a></li><li><a href=/documentation/runners/nemo/>Apache Nemo</a></li><li><a href=/documentation/runners/samza/>Apache Samza</a></li><li><a href=/documentation/runners/spark/>Apache Spark</a></li><li><a href=/documentation/runners/dataflow/>Google Cloud Dataflow</a></li><li><a href=/documentation/runners/jet/>Hazelcast Jet</a></li><li><a href=/documentation/runners/twister2/>Twister2</a></li></ul></nav></div><nav class="page-nav clearfix" data-offset-top=90 data-offset-bottom=500><nav id=TableOfContents><ul><li><a href=#setup>Cloud Dataflow Runner prerequisites and setup</a><ul><li><a href=#dependency>Specify your dependency</a></li><li><a href=#self-executing-jar>Self executing JAR</a></li></ul></li><li><a href=#pipeline-options>Pipeline options for the Cloud Dataflow Runner</a></li><li><a href=#additional-info>Additional information and caveats</a><ul><li><a href=#monitoring>Monitoring your job</a></li><li><a href=#blocking-execution>Blocking Execution</a></li><li><a href=#streaming-execution>Streaming Execution</a></li></ul></li></ul></nav></nav><div class="body__contained body__section-nav"><h1 id=using-the-google-cloud-dataflow-runner>Using the Google Cloud Dataflow Runner</h1><nav class=language-switcher><strong>Adapt for:</strong><ul><li data-value=java class=active>Java SDK</li><li data-value=py>Python SDK</li></ul></nav><p>The Google Cloud Dataflow Runner uses the <a href=https://cloud.google.com/dataflow/service/dataflow-service-desc>Cloud Dataflow managed service</a>. When you run your pipeline with the Cloud Dataflow service, the runner uploads your executable code and dependencies to a Google Cloud Storage bucket and creates a Cloud Dataflow job, which executes your pipeline on managed resources in Google Cloud Platform.</p><p>The Cloud Dataflow Runner and service are suitable for large scale, continuous jobs, and provide:</p><ul><li>a fully managed service</li><li><a href=https://cloud.google.com/dataflow/service/dataflow-service-desc#autoscaling>autoscaling</a> of the number of workers throughout the lifetime of the job</li><li><a href=https://cloud.google.com/blog/products/gcp/no-shard-left-behind-dynamic-work-rebalancing-in-google-cloud-dataflow>dynamic work rebalancing</a></li></ul><p>The <a href=/documentation/runners/capability-matrix/>Beam Capability Matrix</a> documents the supported capabilities of the Cloud Dataflow Runner.</p><h2 id=setup>Cloud Dataflow Runner prerequisites and setup</h2><p>To use the Cloud Dataflow Runner, you must complete the setup in the <em>Before you
begin</em> section of the <a href=https://cloud.google.com/dataflow/docs/quickstarts>Cloud Dataflow quickstart</a>
for your chosen language.</p><ol><li>Select or create a Google Cloud Platform Console project.</li><li>Enable billing for your project.</li><li>Enable the required Google Cloud APIs: Cloud Dataflow, Compute Engine,
Stackdriver Logging, Cloud Storage, Cloud Storage JSON, and Cloud Resource
Manager. You may need to enable additional APIs (such as BigQuery, Cloud
Pub/Sub, or Cloud Datastore) if you use them in your pipeline code.</li><li>Authenticate with Google Cloud Platform.</li><li>Install the Google Cloud SDK.</li><li>Create a Cloud Storage bucket.</li></ol><h3 id=dependency>Specify your dependency</h3><p><span class=language-java>When using Java, you must specify your dependency on the Cloud Dataflow Runner in your <code>pom.xml</code>.</span><div class='language-java snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-java data-lang=java><span class=line><span class=cl><span class=o>&lt;</span><span class=n>dependency</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>groupId</span><span class=o>&gt;</span><span class=n>org</span><span class=o>.</span><span class=na>apache</span><span class=o>.</span><span class=na>beam</span><span class=o>&lt;/</span><span class=n>groupId</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>artifactId</span><span class=o>&gt;</span><span class=n>beam</span><span class=o>-</span><span class=n>runners</span><span class=o>-</span><span class=n>google</span><span class=o>-</span><span class=n>cloud</span><span class=o>-</span><span class=n>dataflow</span><span class=o>-</span><span class=n>java</span><span class=o>&lt;/</span><span class=n>artifactId</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>version</span><span class=o>&gt;</span><span class=n>2</span><span class=o>.</span><span class=na>55</span><span class=o>.</span><span class=na>1</span><span class=o>&lt;/</span><span class=n>version</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>scope</span><span class=o>&gt;</span><span class=n>runtime</span><span class=o>&lt;/</span><span class=n>scope</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl><span class=o>&lt;/</span><span class=n>dependency</span><span class=o>&gt;</span></span></span></code></pre></div></div></div></p><p><span class=language-py>This section is not applicable to the Beam SDK for Python.</span></p><h3 id=self-executing-jar>Self executing JAR</h3><p class=language-py>This section is not applicable to the Beam SDK for Python.</p><p class=language-java>In some cases, such as starting a pipeline using a scheduler such as <a href=https://airflow.apache.org>Apache AirFlow</a>, you must have a self-contained application. You can pack a self-executing JAR by explicitly adding the following dependency on the Project section of your pom.xml, in addition to the adding existing dependency shown in the previous section.</p><div class='language-java snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-java data-lang=java><span class=line><span class=cl><span class=o>&lt;</span><span class=n>dependency</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>groupId</span><span class=o>&gt;</span><span class=n>org</span><span class=o>.</span><span class=na>apache</span><span class=o>.</span><span class=na>beam</span><span class=o>&lt;/</span><span class=n>groupId</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>artifactId</span><span class=o>&gt;</span><span class=n>beam</span><span class=o>-</span><span class=n>runners</span><span class=o>-</span><span class=n>google</span><span class=o>-</span><span class=n>cloud</span><span class=o>-</span><span class=n>dataflow</span><span class=o>-</span><span class=n>java</span><span class=o>&lt;/</span><span class=n>artifactId</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>version</span><span class=o>&gt;</span><span class=n>$</span><span class=o>{</span><span class=n>beam</span><span class=o>.</span><span class=na>version</span><span class=o>}&lt;/</span><span class=n>version</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>scope</span><span class=o>&gt;</span><span class=n>runtime</span><span class=o>&lt;/</span><span class=n>scope</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl><span class=o>&lt;/</span><span class=n>dependency</span><span class=o>&gt;</span></span></span></code></pre></div></div></div><p class=language-java>Then, add the mainClass name in the Maven JAR plugin.</p><div class='language-java snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-java data-lang=java><span class=line><span class=cl><span class=o>&lt;</span><span class=n>plugin</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>groupId</span><span class=o>&gt;</span><span class=n>org</span><span class=o>.</span><span class=na>apache</span><span class=o>.</span><span class=na>maven</span><span class=o>.</span><span class=na>plugins</span><span class=o>&lt;/</span><span class=n>groupId</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>artifactId</span><span class=o>&gt;</span><span class=n>maven</span><span class=o>-</span><span class=n>jar</span><span class=o>-</span><span class=n>plugin</span><span class=o>&lt;/</span><span class=n>artifactId</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>version</span><span class=o>&gt;</span><span class=n>$</span><span class=o>{</span><span class=n>maven</span><span class=o>-</span><span class=n>jar</span><span class=o>-</span><span class=n>plugin</span><span class=o>.</span><span class=na>version</span><span class=o>}&lt;/</span><span class=n>version</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>configuration</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>archive</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>manifest</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>addClasspath</span><span class=o>&gt;</span><span class=kc>true</span><span class=o>&lt;/</span><span class=n>addClasspath</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>classpathPrefix</span><span class=o>&gt;</span><span class=n>lib</span><span class=o>/&lt;/</span><span class=n>classpathPrefix</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;</span><span class=n>mainClass</span><span class=o>&gt;</span><span class=n>YOUR_MAIN_CLASS_NAME</span><span class=o>&lt;/</span><span class=n>mainClass</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;/</span><span class=n>manifest</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;/</span><span class=n>archive</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl> <span class=o>&lt;/</span><span class=n>configuration</span><span class=o>&gt;</span>
</span></span><span class=line><span class=cl><span class=o>&lt;/</span><span class=n>plugin</span><span class=o>&gt;</span></span></span></code></pre></div></div></div><p class=language-java>After running <code>mvn package -Pdataflow-runner</code>, run <code>ls target</code> and you should see (assuming your artifactId is <code>beam-examples</code> and the version is 1.0.0) the following output.</p><div class='language-java snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-java data-lang=java><span class=line><span class=cl><span class=n>beam</span><span class=o>-</span><span class=n>examples</span><span class=o>-</span><span class=n>bundled</span><span class=o>-</span><span class=n>1</span><span class=o>.</span><span class=na>0</span><span class=o>.</span><span class=na>0</span><span class=o>.</span><span class=na>jar</span></span></span></code></pre></div></div></div><p class=language-java>To run the self-executing JAR on Cloud Dataflow, use the following command.</p><div class='language-java snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-java data-lang=java><span class=line><span class=cl><span class=n>java</span> <span class=o>-</span><span class=n>jar</span> <span class=n>target</span><span class=o>/</span><span class=n>beam</span><span class=o>-</span><span class=n>examples</span><span class=o>-</span><span class=n>bundled</span><span class=o>-</span><span class=n>1</span><span class=o>.</span><span class=na>0</span><span class=o>.</span><span class=na>0</span><span class=o>.</span><span class=na>jar</span> <span class=err>\</span>
</span></span><span class=line><span class=cl> <span class=o>--</span><span class=n>runner</span><span class=o>=</span><span class=n>DataflowRunner</span> <span class=err>\</span>
</span></span><span class=line><span class=cl> <span class=o>--</span><span class=n>project</span><span class=o>=&lt;</span><span class=n>YOUR_GCP_PROJECT_ID</span><span class=o>&gt;</span> <span class=err>\</span>
</span></span><span class=line><span class=cl> <span class=o>--</span><span class=n>region</span><span class=o>=&lt;</span><span class=n>GCP_REGION</span><span class=o>&gt;</span> <span class=err>\</span>
</span></span><span class=line><span class=cl> <span class=o>--</span><span class=n>tempLocation</span><span class=o>=</span><span class=n>gs</span><span class=o>:</span><span class=c1>//&lt;YOUR_GCS_BUCKET&gt;/temp/ \
</span></span></span><span class=line><span class=cl><span class=c1></span> <span class=o>--</span><span class=n>output</span><span class=o>=</span><span class=n>gs</span><span class=o>:</span><span class=c1>//&lt;YOUR_GCS_BUCKET&gt;/output
</span></span></span></code></pre></div></div></div><h2 id=pipeline-options>Pipeline options for the Cloud Dataflow Runner</h2><p><span class=language-java>When executing your pipeline with the Cloud Dataflow Runner (Java), consider these common pipeline options.</span>
<span class=language-py>When executing your pipeline with the Cloud Dataflow Runner (Python), consider these common pipeline options.</span></p><div class=table-container-wrapper><table class="table table-bordered"><tr><th>Field</th><th>Description</th><th>Default Value</th></tr><tr><td><code>runner</code></td><td>The pipeline runner to use. This option allows you to determine the pipeline runner at runtime.</td><td>Set to <code>dataflow</code> or <code>DataflowRunner</code> to run on the Cloud Dataflow Service.</td></tr><tr><td><code>project</code></td><td>The project ID for your Google Cloud Project.</td><td>If not set, defaults to the default project in the current environment. The default project is set via <code>gcloud</code>.</td></tr><tr><td><code>region</code></td><td>The Google Compute Engine region to create the job.</td><td>If not set, defaults to the default region in the current environment. The default region is set via <code>gcloud</code>.</td></tr><tr><td><code>streaming</code></td><td>Whether streaming mode is enabled or disabled; <code>true</code> if enabled. Set to <code>true</code> if running pipelines with unbounded <code>PCollection</code>s.</td><td><code>false</code></td></tr><tr><td><span class=language-java><code>tempLocation</code></span>
<span class=language-py><code>temp_location</code></span></td><td><span class=language-java>Optional.</span>
<span class=language-py>Required.</span>
Path for temporary files. Must be a valid Google Cloud Storage URL that begins with <code>gs://</code>.
<span class=language-java>If set, <code>tempLocation</code> is used as the default value for <code>gcpTempLocation</code>.</span></td><td>No default value.</td></tr><tr class=language-java><td><code>gcpTempLocation</code></td><td>Cloud Storage bucket path for temporary files. Must be a valid Cloud Storage URL that begins with <code>gs://</code>.</td><td>If not set, defaults to the value of <code>tempLocation</code>, provided that <code>tempLocation</code> is a valid Cloud Storage URL. If <code>tempLocation</code> is not a valid Cloud Storage URL, you must set <code>gcpTempLocation</code>.</td></tr><tr><td><span class=language-java><code>stagingLocation</code></span>
<span class=language-py><code>staging_location</code></span></td><td>Optional. Cloud Storage bucket path for staging your binary and any temporary files. Must be a valid Cloud Storage URL that begins with <code>gs://</code>.</td><td><span class=language-java>If not set, defaults to a staging directory within <code>gcpTempLocation</code>.</span>
<span class=language-py>If not set, defaults to a staging directory within <code>temp_location</code>.</span></td></tr><tr class=language-py><td><code>save_main_session</code></td><td>Save the main session state so that pickled functions and classes defined in <code>__main__</code> (e.g. interactive session) can be unpickled. Some workflows do not need the session state if, for instance, all of their functions/classes are defined in proper modules (not <code>__main__</code>) and the modules are importable in the worker.</td><td><code>false</code></td></tr><tr class=language-py><td><code>sdk_location</code></td><td>Override the default location from where the Beam SDK is downloaded. This value can be a URL, a Cloud Storage path, or a local path to an SDK tarball. Workflow submissions will download or copy the SDK tarball from this location. If set to the string <code>default</code>, a standard SDK location is used. If empty, no SDK is copied.</td><td><code>default</code></td></tr></table></div><p>See the reference documentation for the
<span class=language-java><a href=https://beam.apache.org/releases/javadoc/2.55.1/index.html?org/apache/beam/runners/dataflow/options/DataflowPipelineOptions.html>DataflowPipelineOptions</a></span>
<span class=language-py><a href=https://beam.apache.org/releases/pydoc/2.55.1/apache_beam.options.pipeline_options.html#apache_beam.options.pipeline_options.PipelineOptions><code>PipelineOptions</code></a></span>
interface (and any subinterfaces) for additional pipeline configuration options.</p><h2 id=additional-info>Additional information and caveats</h2><h3 id=monitoring>Monitoring your job</h3><p>While your pipeline executes, you can monitor the job&rsquo;s progress, view details on execution, and receive updates on the pipeline&rsquo;s results by using the <a href=https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf>Dataflow Monitoring Interface</a> or the <a href=https://cloud.google.com/dataflow/pipelines/dataflow-command-line-intf>Dataflow Command-line Interface</a>.</p><h3 id=blocking-execution>Blocking Execution</h3><p>To block until your job completes, call <span class=language-java><code>waitToFinish</code></span><span class=language-py><code>wait_until_finish</code></span> on the <code>PipelineResult</code> returned from <code>pipeline.run()</code>. The Cloud Dataflow Runner prints job status updates and console messages while it waits. While the result is connected to the active job, note that pressing <strong>Ctrl+C</strong> from the command line does not cancel your job. To cancel the job, you can use the <a href=https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf>Dataflow Monitoring Interface</a> or the <a href=https://cloud.google.com/dataflow/pipelines/dataflow-command-line-intf>Dataflow Command-line Interface</a>.</p><h3 id=streaming-execution>Streaming Execution</h3><p>If your pipeline uses an unbounded data source or sink, you must set the <code>streaming</code> option to <code>true</code>.</p><p>When using streaming execution, keep the following considerations in mind.</p><ol><li><p>Streaming pipelines do not terminate unless explicitly cancelled by the user.
You can cancel your streaming job from the <a href=https://cloud.google.com/dataflow/pipelines/stopping-a-pipeline>Dataflow Monitoring Interface</a>
or with the <a href=https://cloud.google.com/dataflow/pipelines/dataflow-command-line-intf>Dataflow Command-line Interface</a>
(<a href=https://cloud.google.com/sdk/gcloud/reference/dataflow/jobs/cancel>gcloud dataflow jobs cancel</a>
command).</p></li><li><p>Streaming jobs use a Google Compute Engine <a href=https://cloud.google.com/compute/docs/machine-types>machine type</a>
of <code>n1-standard-2</code> or higher by default. You must not override this, as
<code>n1-standard-2</code> is the minimum required machine type for running streaming
jobs.</p></li><li><p>Streaming execution <a href=https://cloud.google.com/dataflow/pricing>pricing</a>
differs from batch execution.</p></li></ol><div class=feedback><p class=update>Last updated on 2024/04/25</p><h3>Have you found everything you were looking for?</h3><p class=description>Was it all useful and clear? Is there anything that you would like to change? Let us know!</p><button class=load-button><a href="https://docs.google.com/forms/d/e/1FAIpQLSfID7abne3GE6k6RdJIyZhPz2Gef7UkpggUEhTIDjjplHuxSA/viewform?usp=header_link" target=_blank>SEND FEEDBACK</a></button></div></div></div><footer class=footer><div class=footer__contained><div class=footer__cols><div class="footer__cols__col footer__cols__col__logos"><div class=footer__cols__col__logo><img src=/images/beam_logo_circle.svg class=footer__logo alt="Beam logo"></div><div class=footer__cols__col__logo><img src=/images/apache_logo_circle.svg class=footer__logo alt="Apache logo"></div></div><div class=footer-wrapper><div class=wrapper-grid><div class=footer__cols__col><div class=footer__cols__col__title>Start</div><div class=footer__cols__col__link><a href=/get-started/beam-overview/>Overview</a></div><div class=footer__cols__col__link><a href=/get-started/quickstart-java/>Quickstart (Java)</a></div><div class=footer__cols__col__link><a href=/get-started/quickstart-py/>Quickstart (Python)</a></div><div class=footer__cols__col__link><a href=/get-started/quickstart-go/>Quickstart (Go)</a></div><div class=footer__cols__col__link><a href=/get-started/downloads/>Downloads</a></div></div><div class=footer__cols__col><div class=footer__cols__col__title>Docs</div><div class=footer__cols__col__link><a href=/documentation/programming-guide/>Concepts</a></div><div class=footer__cols__col__link><a href=/documentation/pipelines/design-your-pipeline/>Pipelines</a></div><div class=footer__cols__col__link><a href=/documentation/runners/capability-matrix/>Runners</a></div></div><div class=footer__cols__col><div class=footer__cols__col__title>Community</div><div class=footer__cols__col__link><a href=/contribute/>Contribute</a></div><div class=footer__cols__col__link><a href=https://projects.apache.org/committee.html?beam target=_blank>Team<img src=/images/external-link-icon.png width=14 height=14 alt="External link."></a></div><div class=footer__cols__col__link><a href=/community/presentation-materials/>Media</a></div><div class=footer__cols__col__link><a href=/community/in-person/>Events/Meetups</a></div><div class=footer__cols__col__link><a href=/community/contact-us/>Contact Us</a></div></div><div class=footer__cols__col><div class=footer__cols__col__title>Resources</div><div class=footer__cols__col__link><a href=/blog/>Blog</a></div><div class=footer__cols__col__link><a href=https://github.com/apache/beam>GitHub</a></div></div></div><div class=footer__bottom>&copy;
<a href=https://www.apache.org>The Apache Software Foundation</a>
| <a href=/privacy_policy>Privacy Policy</a>
| <a href=/feed.xml>RSS Feed</a><br><br>Apache Beam, Apache, Beam, the Beam logo, and the Apache feather logo are either registered trademarks or trademarks of The Apache Software Foundation. All other products or name brands are trademarks of their respective holders, including The Apache Software Foundation.</div></div><div class="footer__cols__col footer__cols__col__logos"><div class=footer__cols__col--group><div class=footer__cols__col__logo><a href=https://github.com/apache/beam><img src=/images/logos/social-icons/github-logo-150.png class=footer__logo alt="Github logo"></a></div><div class=footer__cols__col__logo><a href=https://www.linkedin.com/company/apache-beam/><img src=/images/logos/social-icons/linkedin-logo-150.png class=footer__logo alt="Linkedin logo"></a></div></div><div class=footer__cols__col--group><div class=footer__cols__col__logo><a href=https://twitter.com/apachebeam><img src=/images/logos/social-icons/twitter-logo-150.png class=footer__logo alt="Twitter logo"></a></div><div class=footer__cols__col__logo><a href=https://www.youtube.com/channel/UChNnb_YO_7B0HlW6FhAXZZQ><img src=/images/logos/social-icons/youtube-logo-150.png class=footer__logo alt="Youtube logo"></a></div></div></div></div></div></footer></body></html>