blob: 00a559dac453082e136d7932393045c6b0892c2b [file] [log] [blame]
<!doctype html><html lang=en class=no-js><head><meta charset=utf-8><meta http-equiv=x-ua-compatible content="IE=edge"><meta name=viewport content="width=device-width,initial-scale=1"><title>Getting started from Apache Spark</title><meta name=description content="Apache Beam is an open source, unified model and set of language-specific SDKs for defining and executing data processing workflows, and also data ingestion and integration flows, supporting Enterprise Integration Patterns (EIPs) and Domain Specific Languages (DSLs). Dataflow pipelines simplify the mechanics of large-scale batch and streaming data processing and can run on a number of runtimes like Apache Flink, Apache Spark, and Google Cloud Dataflow (a cloud service). Beam also brings DSL in different languages, allowing users to easily implement their data integration processes."><link href="https://fonts.googleapis.com/css?family=Roboto:100,300,400,500,700" rel=stylesheet><link rel=preload href=/scss/main.min.408fddfe3e8a45f87a5a8c9a839d77db667c1c534e5e5cd0d957ffc3dd6c14cf.css as=style><link href=/scss/main.min.408fddfe3e8a45f87a5a8c9a839d77db667c1c534e5e5cd0d957ffc3dd6c14cf.css rel=stylesheet integrity><script src=https://code.jquery.com/jquery-2.2.4.min.js></script><style>.body__contained img{max-width:100%}</style><script type=text/javascript src=/js/bootstrap.min.2979f9a6e32fc42c3e7406339ee9fe76b31d1b52059776a02b4a7fa6a4fd280a.js defer></script>
<script type=text/javascript src=/js/language-switch-v2.min.121952b7980b920320ab229551857669209945e39b05ba2b433a565385ca44c6.js defer></script>
<script type=text/javascript src=/js/fix-menu.min.039174b67107465f2090a493f91e126f7aa797f29420f9edab8a54d9dd4b3d2d.js defer></script>
<script type=text/javascript src=/js/section-nav.min.1405fd5e70fab5f6c54037c269b1d137487d8f3d1b3009032525f6db3fbce991.js defer></script>
<script type=text/javascript src=/js/page-nav.min.af231204c9c52c5089d53a4c02739eacbb7f939e3be1c6ffcc212e0ac4dbf879.js defer></script>
<script type=text/javascript src=/js/expandable-list.min.75a4526624a3b8898fe7fb9e3428c205b581f8b38c7926922467aef17eac69f2.js defer></script>
<script type=text/javascript src=/js/copy-to-clipboard.min.364c06423d7e8993fc42bb4abc38c03195bc8386db26d18774ce775d08d5b18d.js defer></script>
<script type=text/javascript src=/js/calendar.min.336664054fa0f52b08bbd4e3c59b5cb6d63dcfb2b4d602839746516b0817446b.js defer></script>
<script type=text/javascript src=/js/fix-playground-nested-scroll.min.0283f1037cb1b9d5074c6eaf041292b524a8148a7cdb803d5ccd6d1fc4eb3253.js defer></script>
<script type=text/javascript src=/js/anchor-content-jump-fix.min.22d3240f81632e4c11179b9d2aaf37a40da9414333c43aa97344e8b21a7df0e4.js defer></script>
<link rel=alternate type=application/rss+xml title="Apache Beam" href=/feed.xml><link rel=canonical href=/get-started/from-spark/ data-proofer-ignore><link rel="shortcut icon" type=image/x-icon href=/images/favicon.ico><link rel=stylesheet href=https://use.fontawesome.com/releases/v5.4.1/css/all.css integrity=sha384-5sAR7xN1Nv6T6+dT2mhtzEpVJvfS3NScPQTrOxhwjIuvcA67KV2R5Jz6kr4abQsz crossorigin=anonymous><link rel=stylesheet href=https://unpkg.com/swiper@8/swiper-bundle.min.css><script async src=https://platform.twitter.com/widgets.js></script>
<script>(function(e,t,n,s,o,i,a){e.GoogleAnalyticsObject=o,e[o]=e[o]||function(){(e[o].q=e[o].q||[]).push(arguments)},e[o].l=1*new Date,i=t.createElement(n),a=t.getElementsByTagName(n)[0],i.async=1,i.src=s,a.parentNode.insertBefore(i,a)})(window,document,"script","//www.google-analytics.com/analytics.js","ga"),ga("create","UA-73650088-1","auto"),ga("send","pageview")</script><script>(function(e,t,n,s,o,i){e.hj=e.hj||function(){(e.hj.q=e.hj.q||[]).push(arguments)},e._hjSettings={hjid:2182187,hjsv:6},o=t.getElementsByTagName("head")[0],i=t.createElement("script"),i.async=1,i.src=n+e._hjSettings.hjid+s+e._hjSettings.hjsv,o.appendChild(i)})(window,document,"https://static.hotjar.com/c/hotjar-",".js?sv=")</script></head><body class=body data-spy=scroll data-target=.page-nav data-offset=0><nav class="navigation-bar-mobile header navbar navbar-fixed-top"><div class=navbar-header><a href=/ class=navbar-brand><img alt=Brand style=height:46px;width:43px src=/images/beam_logo_navbar_mobile.png></a>
<a class=navbar-link href=/get-started/>Get Started</a>
<a class=navbar-link href=/documentation/>Documentation</a>
<button type=button class="navbar-toggle menu-open" aria-expanded=false aria-controls=navbar onclick=openMenu()>
<span class=sr-only>Toggle navigation</span>
<span class=icon-bar></span>
<span class=icon-bar></span>
<span class=icon-bar></span></button></div><div class="navbar-mask closed"></div><div id=navbar class="navbar-container closed"><button type=button class=navbar-toggle aria-expanded=false aria-controls=navbar id=closeMenu>
<span class=sr-only>Toggle navigation</span>
<span class=icon-bar></span>
<span class=icon-bar></span>
<span class=icon-bar></span></button><ul class="nav navbar-nav"><li><div class=searchBar-mobile><script>(function(){var t,n="012923275103528129024:4emlchv9wzi",e=document.createElement("script");e.type="text/javascript",e.async=!0,e.src="https://cse.google.com/cse.js?cx="+n,t=document.getElementsByTagName("script")[0],t.parentNode.insertBefore(e,t)})()</script><gcse:search></gcse:search></div></li><li><a class=navbar-link href=/about>About</a></li><li><a class=navbar-link href=/get-started/>Get Started</a></li><li><span class=navbar-link>Documentation</span><ul><li><a href=/documentation/>General</a></li><li><a href=/documentation/sdks/java/>Languages</a></li><li><a href=/documentation/runners/capability-matrix/>Runners</a></li><li><a href=/documentation/io/connectors/>I/O Connectors</a></li></ul></li><li><a class=navbar-link href=/roadmap/>Roadmap</a></li><li><a class=navbar-link href=/community/>Community</a></li><li><a class=navbar-link href=/contribute/>Contribute</a></li><li><a class=navbar-link href=/blog/>Blog</a></li><li><a class=navbar-link href=/case-studies/>Case Studies</a></li></ul><ul class="nav navbar-nav navbar-right"><li><a href=https://github.com/apache/beam/edit/master/website/www/site/content/en/get-started/from-spark.md data-proofer-ignore><svg xmlns="http://www.w3.org/2000/svg" width="25" height="24" fill="none" viewBox="0 0 25 24"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M4.543 20h4l10.5-10.5c.53-.53.828-1.25.828-2s-.298-1.47-.828-2-1.25-.828-2-.828-1.47.298-2 .828L4.543 16v4zm9.5-13.5 4 4"/></svg></a></li><li class=dropdown><a href=# class=dropdown-toggle id=apache-dropdown data-toggle=dropdown role=button aria-haspopup=true aria-expanded=false><img src=https://www.apache.org/foundation/press/kit/feather_small.png alt="Apache Logo" style=height:20px>
&nbsp;Apache
<span class=arrow-icon><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" fill="none" viewBox="0 0 20 20"><circle cx="10" cy="10" r="10" fill="#ff6d00"/><path stroke="#fff" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M8.535 5.28l4.573 4.818-4.573 4.403"/></svg></span></a><ul class="dropdown-menu dropdown-menu-right"><li><a target=_blank href=https://www.apache.org/>ASF Homepage</a></li><li><a target=_blank href=https://www.apache.org/licenses/>License</a></li><li><a target=_blank href=https://www.apache.org/security/>Security</a></li><li><a target=_blank href=https://www.apache.org/foundation/thanks.html>Thanks</a></li><li><a target=_blank href=https://www.apache.org/foundation/sponsorship.html>Sponsorship</a></li><li><a target=_blank href=https://www.apache.org/foundation/policies/conduct>Code of Conduct</a></li></ul></li></ul></div></nav><nav class=navigation-bar-desktop><a href=/ class=navbar-logo><img src=/images/beam_logo_navbar.png alt="Beam Logo"></a><div class=navbar-bar-left><div class=navbar-links><a class=navbar-link href=/about>About</a>
<a class=navbar-link href=/get-started/>Get Started</a><li class="dropdown navbar-dropdown navbar-dropdown-documentation"><a href=# class="dropdown-toggle navbar-link" role=button aria-haspopup=true aria-expanded=false>Documentation
<span><svg xmlns="http://www.w3.org/2000/svg" width="12" height="11" fill="none" viewBox="0 0 12 11"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M10.666 4.535 5.847 9.108 1.444 4.535"/></svg></span></a><ul class=dropdown-menu><li><a class=navbar-dropdown-menu-link href=/documentation/>General</a></li><li><a class=navbar-dropdown-menu-link href=/documentation/sdks/java/>Languages</a></li><li><a class=navbar-dropdown-menu-link href=/documentation/runners/capability-matrix/>Runners</a></li><li><a class=navbar-dropdown-menu-link href=/documentation/io/connectors/>I/O Connectors</a></li></ul></li><a class=navbar-link href=/roadmap/>Roadmap</a>
<a class=navbar-link href=/community/>Community</a>
<a class=navbar-link href=/contribute/>Contribute</a>
<a class=navbar-link href=/blog/>Blog</a>
<a class=navbar-link href=/case-studies/>Case Studies</a></div><div id=iconsBar><a type=button onclick=showSearch()><svg xmlns="http://www.w3.org/2000/svg" width="25" height="24" fill="none" viewBox="0 0 25 24"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M10.191 17c3.866.0 7-3.134 7-7s-3.134-7-7-7-7 3.134-7 7 3.134 7 7 7zm11 4-6-6"/></svg></a><a target=_blank href=https://github.com/apache/beam/edit/master/website/www/site/content/en/get-started/from-spark.md data-proofer-ignore><svg xmlns="http://www.w3.org/2000/svg" width="25" height="24" fill="none" viewBox="0 0 25 24"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M4.543 20h4l10.5-10.5c.53-.53.828-1.25.828-2s-.298-1.47-.828-2-1.25-.828-2-.828-1.47.298-2 .828L4.543 16v4zm9.5-13.5 4 4"/></svg></a><li class="dropdown navbar-dropdown navbar-dropdown-apache"><a href=# class=dropdown-toggle role=button aria-haspopup=true aria-expanded=false><img src=https://www.apache.org/foundation/press/kit/feather_small.png alt="Apache Logo" style=height:20px>
&nbsp;Apache
<span class=arrow-icon><svg xmlns="http://www.w3.org/2000/svg" width="20" height="20" fill="none" viewBox="0 0 20 20"><circle cx="10" cy="10" r="10" fill="#ff6d00"/><path stroke="#fff" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M8.535 5.28l4.573 4.818-4.573 4.403"/></svg></span></a><ul class=dropdown-menu><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/>ASF Homepage</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/licenses/>License</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/security/>Security</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/foundation/thanks.html>Thanks</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/foundation/sponsorship.html>Sponsorship</a></li><li><a class=navbar-dropdown-menu-link target=_blank href=https://www.apache.org/foundation/policies/conduct>Code of Conduct</a></li></ul></li></div><div class="searchBar disappear"><script>(function(){var t,n="012923275103528129024:4emlchv9wzi",e=document.createElement("script");e.type="text/javascript",e.async=!0,e.src="https://cse.google.com/cse.js?cx="+n,t=document.getElementsByTagName("script")[0],t.parentNode.insertBefore(e,t)})()</script><gcse:search></gcse:search>
<a type=button onclick=endSearch()><svg xmlns="http://www.w3.org/2000/svg" width="25" height="25" fill="none" viewBox="0 0 25 25"><path stroke="#ff6d00" stroke-linecap="round" stroke-linejoin="round" stroke-width="2.75" d="M21.122 20.827 4.727 4.432M21.122 4.43 4.727 20.827"/></svg></a></div></div></nav><div class=header-push></div><div class="top-banners swiper"><div class=swiper-wrapper><div class=swiper-slide><a href=https://tour.beam.apache.org><img class=banner-img-desktop src=/images/banners/tour-of-beam/tour-of-beam-desktop.png alt="Start Tour of Beam">
<img class=banner-img-mobile src=/images/banners/tour-of-beam/tour-of-beam-mobile.png alt="Start Tour of Beam"></a></div><div class=swiper-slide><a href=https://beam.apache.org/documentation/ml/overview/><img class=banner-img-desktop src=/images/banners/machine-learning/machine-learning-desktop.jpg alt="Machine Learning">
<img class=banner-img-mobile src=/images/banners/machine-learning/machine-learning-mobile.jpg alt="Machine Learning"></a></div></div><div class=swiper-pagination></div><div class=swiper-button-prev></div><div class=swiper-button-next></div></div><script src=/js/swiper-bundle.min.min.e0e8f81b0b15728d35ff73c07f42ddbb17a108d6f23df4953cb3e60df7ade675.js></script>
<script src=/js/sliders/top-banners.min.afa7d0a19acf7a3b28ca369490b3d401a619562a2a4c9612577be2f66a4b9855.js></script>
<script>function showSearch(){addPlaceholder();var e,t=document.querySelector(".searchBar");t.classList.remove("disappear"),e=document.querySelector("#iconsBar"),e.classList.add("disappear")}function addPlaceholder(){$("input:text").attr("placeholder","What are you looking for?")}function endSearch(){var e,t=document.querySelector(".searchBar");t.classList.add("disappear"),e=document.querySelector("#iconsBar"),e.classList.remove("disappear")}function blockScroll(){$("body").toggleClass("fixedPosition")}function openMenu(){addPlaceholder(),blockScroll()}</script><div class="clearfix container-main-content"><div class="section-nav closed" data-offset-top=90 data-offset-bottom=500><span class="section-nav-back glyphicon glyphicon-menu-left"></span><nav><ul class=section-nav-list data-section-nav><li><span class=section-nav-list-main-title>Get started</span></li><li><a href=/get-started/beam-overview/>Beam Overview</a></li><li><a href=/get-started/an-interactive-overview-of-beam/>An Interactive Overview of Beam</a></li><li><span class=section-nav-list-title>Quickstarts</span><ul class=section-nav-list><li><a href=https://tour.beam.apache.org>Tour of Beam</a></li><li><a href=/get-started/try-apache-beam/>Try Apache Beam</a></li><li><a href=/get-started/try-beam-playground/>Try Beam Playground</a></li><li><a href=/get-started/quickstart/java/>Java quickstart</a></li><li><a href=/get-started/quickstart/python/>Python quickstart</a></li><li><a href=/get-started/quickstart/go/>Go quickstart</a></li><li><a href=/get-started/quickstart/typescript/>Typescript quickstart</a></li><li><a href=/get-started/from-spark/>Apache Spark</a></li><li><a href=/get-started/quickstart-java/>WordCount (Java)</a></li><li><a href=/get-started/quickstart-py/>WordCount (Python)</a></li><li><a href=/get-started/quickstart-go/>WordCount (Go)</a></li></ul></li><li><a href=/get-started/downloads>Install the SDK</a></li><li><span class=section-nav-list-title>Tutorials</span><ul class=section-nav-list><li><a href=/get-started/wordcount-example/>WordCount</a></li><li><a href=/get-started/mobile-gaming-example/>Mobile Gaming</a></li></ul></li><li class=section-nav-item--collapsible><span class=section-nav-list-title>Learning resources</span><ul class=section-nav-list><li><a href=/get-started/resources/learning-resources/#getting-started>Getting Started</a></li><li><a href=/get-started/resources/learning-resources/#articles>Articles</a></li><li><a href=/get-started/resources/learning-resources/#videos>Videos</a></li><li><a href=/get-started/resources/learning-resources/#courses>Courses</a></li><li><a href=/get-started/resources/learning-resources/#books>Books</a></li><li><a href=/get-started/resources/learning-resources/#certifications>Certifications</a></li><li><a href=/get-started/resources/learning-resources/#interactive-labs>Interactive Labs</a></li><li><a href=/get-started/resources/learning-resources/#beam-katas>Beam Katas</a></li><li><a href=/get-started/resources/learning-resources/#code-examples>Code Examples</a></li><li><a href=/get-started/resources/learning-resources/#api-reference>API Reference</a></li><li><a href=/get-started/resources/learning-resources/#feedback-and-suggestions>Feedback and Suggestions</a></li><li><a href=/get-started/resources/learning-resources/#how-to-contribute>How to Contribute</a></li><li><a href=/get-started/resources/videos-and-podcasts>Videos and Podcasts</a></li></ul></li><li><a href=/security>Security</a></li></ul></nav></div><nav class="page-nav clearfix" data-offset-top=90 data-offset-bottom=500><nav id=TableOfContents><ul><li><a href=#overview>Overview</a></li><li><a href=#setup>Setup</a></li><li><a href=#transforms>Transforms</a></li><li><a href=#using-calculated-values>Using calculated values</a></li><li><a href=#next-steps>Next Steps</a></li></ul></nav></nav><div class="body__contained body__section-nav"><h1 id=getting-started-from-apache-spark>Getting started from Apache Spark</h1><script type=text/javascript>localStorage.setItem("language","language-py")</script><p>If you already know <a href=https://spark.apache.org/><em>Apache Spark</em></a>,
using Beam should be easy.
The basic concepts are the same, and the APIs are similar as well.</p><p>Spark stores data <em>Spark DataFrames</em> for structured data,
and in <em>Resilient Distributed Datasets</em> (RDD) for unstructured data.
We are using RDDs for this guide.</p><p>A Spark RDD represents a collection of elements,
while in Beam it&rsquo;s called a <em>Parallel Collection</em> (PCollection).
A PCollection in Beam does <em>not</em> have any ordering guarantees.</p><p>Likewise, a transform in Beam is called a <em>Parallel Transform</em> (PTransform).</p><p>Here are some examples of common operations and their equivalent between PySpark and Beam.</p><h2 id=overview>Overview</h2><p>Here&rsquo;s a simple example of a PySpark pipeline that takes the numbers from one to four,
multiplies them by two, adds all the values together, and prints the result.</p><div class='language-py snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=cl><span class=kn>import</span> <span class=nn>pyspark</span>
</span></span><span class=line><span class=cl>
</span></span><span class=line><span class=cl><span class=n>sc</span> <span class=o>=</span> <span class=n>pyspark</span><span class=o>.</span><span class=n>SparkContext</span><span class=p>()</span>
</span></span><span class=line><span class=cl><span class=n>result</span> <span class=o>=</span> <span class=p>(</span>
</span></span><span class=line><span class=cl> <span class=n>sc</span><span class=o>.</span><span class=n>parallelize</span><span class=p>([</span><span class=mi>1</span><span class=p>,</span> <span class=mi>2</span><span class=p>,</span> <span class=mi>3</span><span class=p>,</span> <span class=mi>4</span><span class=p>])</span>
</span></span><span class=line><span class=cl> <span class=o>.</span><span class=n>map</span><span class=p>(</span><span class=k>lambda</span> <span class=n>x</span><span class=p>:</span> <span class=n>x</span> <span class=o>*</span> <span class=mi>2</span><span class=p>)</span>
</span></span><span class=line><span class=cl> <span class=o>.</span><span class=n>reduce</span><span class=p>(</span><span class=k>lambda</span> <span class=n>x</span><span class=p>,</span> <span class=n>y</span><span class=p>:</span> <span class=n>x</span> <span class=o>+</span> <span class=n>y</span><span class=p>)</span>
</span></span><span class=line><span class=cl><span class=p>)</span>
</span></span><span class=line><span class=cl><span class=nb>print</span><span class=p>(</span><span class=n>result</span><span class=p>)</span></span></span></code></pre></div></div></div><p>In Beam you pipe your data through the pipeline using the
<em>pipe operator</em> <code>|</code> like <code>data | beam.Map(...)</code> instead of chaining
methods like <code>data.map(...)</code>, but they&rsquo;re doing the same thing.</p><p>Here&rsquo;s what an equivalent pipeline looks like in Beam.</p><div class='language-py snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=cl><span class=kn>import</span> <span class=nn>apache_beam</span> <span class=k>as</span> <span class=nn>beam</span>
</span></span><span class=line><span class=cl>
</span></span><span class=line><span class=cl><span class=k>with</span> <span class=n>beam</span><span class=o>.</span><span class=n>Pipeline</span><span class=p>()</span> <span class=k>as</span> <span class=n>pipeline</span><span class=p>:</span>
</span></span><span class=line><span class=cl> <span class=n>result</span> <span class=o>=</span> <span class=p>(</span>
</span></span><span class=line><span class=cl> <span class=n>pipeline</span>
</span></span><span class=line><span class=cl> <span class=o>|</span> <span class=n>beam</span><span class=o>.</span><span class=n>Create</span><span class=p>([</span><span class=mi>1</span><span class=p>,</span> <span class=mi>2</span><span class=p>,</span> <span class=mi>3</span><span class=p>,</span> <span class=mi>4</span><span class=p>])</span>
</span></span><span class=line><span class=cl> <span class=o>|</span> <span class=n>beam</span><span class=o>.</span><span class=n>Map</span><span class=p>(</span><span class=k>lambda</span> <span class=n>x</span><span class=p>:</span> <span class=n>x</span> <span class=o>*</span> <span class=mi>2</span><span class=p>)</span>
</span></span><span class=line><span class=cl> <span class=o>|</span> <span class=n>beam</span><span class=o>.</span><span class=n>CombineGlobally</span><span class=p>(</span><span class=nb>sum</span><span class=p>)</span>
</span></span><span class=line><span class=cl> <span class=o>|</span> <span class=n>beam</span><span class=o>.</span><span class=n>Map</span><span class=p>(</span><span class=nb>print</span><span class=p>)</span>
</span></span><span class=line><span class=cl> <span class=p>)</span></span></span></code></pre></div></div></div><blockquote><p>ℹ️ Note that we called <code>print</code> inside a <code>Map</code> transform.
That&rsquo;s because we can only access the elements of a PCollection
from within a PTransform.
To inspect the data locally, you can use the <a href=https://cloud.google.com/dataflow/docs/guides/interactive-pipeline-development#creating_your_pipeline>InteractiveRunner</a></p></blockquote><p>Another thing to note is that Beam pipelines are constructed lazily.
This means that when you pipe <code>|</code> data you&rsquo;re only declaring the
transformations and the order you want them to happen,
but the actual computation doesn&rsquo;t happen.
The pipeline is run after the <code>with beam.Pipeline() as pipeline</code> context has
closed.</p><blockquote><p>ℹ️ When the <code>with beam.Pipeline() as pipeline</code> context closes,
it implicitly calls <code>pipeline.run()</code> which triggers the computation to happen.</p></blockquote><p>The pipeline is then sent to your
<a href=/documentation/runners/capability-matrix/>runner of choice</a>
and it processes the data.</p><blockquote><p>ℹ️ The pipeline can run locally with the <em>DirectRunner</em>,
or in a distributed runner such as Flink, Spark, or Dataflow.
The Spark runner is not related to PySpark.</p></blockquote><p>A label can optionally be added to a transform using the
<em>right shift operator</em> <code>>></code> like <code>data | 'My description' >> beam.Map(...)</code>.
This serves both as comments and makes your pipeline easier to debug.</p><p>This is how the pipeline looks after adding labels.</p><div class='language-py snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=cl><span class=kn>import</span> <span class=nn>apache_beam</span> <span class=k>as</span> <span class=nn>beam</span>
</span></span><span class=line><span class=cl>
</span></span><span class=line><span class=cl><span class=k>with</span> <span class=n>beam</span><span class=o>.</span><span class=n>Pipeline</span><span class=p>()</span> <span class=k>as</span> <span class=n>pipeline</span><span class=p>:</span>
</span></span><span class=line><span class=cl> <span class=n>result</span> <span class=o>=</span> <span class=p>(</span>
</span></span><span class=line><span class=cl> <span class=n>pipeline</span>
</span></span><span class=line><span class=cl> <span class=o>|</span> <span class=s1>&#39;Create numbers&#39;</span> <span class=o>&gt;&gt;</span> <span class=n>beam</span><span class=o>.</span><span class=n>Create</span><span class=p>([</span><span class=mi>1</span><span class=p>,</span> <span class=mi>2</span><span class=p>,</span> <span class=mi>3</span><span class=p>,</span> <span class=mi>4</span><span class=p>])</span>
</span></span><span class=line><span class=cl> <span class=o>|</span> <span class=s1>&#39;Multiply by two&#39;</span> <span class=o>&gt;&gt;</span> <span class=n>beam</span><span class=o>.</span><span class=n>Map</span><span class=p>(</span><span class=k>lambda</span> <span class=n>x</span><span class=p>:</span> <span class=n>x</span> <span class=o>*</span> <span class=mi>2</span><span class=p>)</span>
</span></span><span class=line><span class=cl> <span class=o>|</span> <span class=s1>&#39;Sum everything&#39;</span> <span class=o>&gt;&gt;</span> <span class=n>beam</span><span class=o>.</span><span class=n>CombineGlobally</span><span class=p>(</span><span class=nb>sum</span><span class=p>)</span>
</span></span><span class=line><span class=cl> <span class=o>|</span> <span class=s1>&#39;Print results&#39;</span> <span class=o>&gt;&gt;</span> <span class=n>beam</span><span class=o>.</span><span class=n>Map</span><span class=p>(</span><span class=nb>print</span><span class=p>)</span>
</span></span><span class=line><span class=cl> <span class=p>)</span></span></span></code></pre></div></div></div><h2 id=setup>Setup</h2><p>Here&rsquo;s a comparison on how to get started both in PySpark and Beam.</p><div class=table-container-wrapper><div class=table-wrapper><table style=width:100% class=table-wrapper--equal-p><tr><th style=width:20%></th><th style=width:40%>PySpark</th><th style=width:40%>Beam</th></tr><tr><td><b>Install</b></td><td><code>$ pip install pyspark</code></td><td><code>$ pip install apache-beam</code></td></tr><tr><td><b>Imports</b></td><td><code>import pyspark</code></td><td><code>import apache_beam as beam</code></td></tr><tr><td><b>Creating a<br>local pipeline</b></td><td><code>sc = pyspark.SparkContext() as sc:</code><br><code># Your pipeline code here.</code></td><td><code>with beam.Pipeline() as pipeline:</code><br><code>&nbsp;&nbsp;&nbsp;&nbsp;# Your pipeline code here.</code></td></tr><tr><td><b>Creating values</b></td><td><code>values = sc.parallelize([1, 2, 3, 4])</code></td><td><code>values = pipeline | beam.Create([1, 2, 3, 4])</code></td></tr><tr><td><b>Creating<br>key-value pairs</b></td><td><code>pairs = sc.parallelize([</code><br><code>&nbsp;&nbsp;&nbsp;&nbsp;('key1', 'value1'),</code><br><code>&nbsp;&nbsp;&nbsp;&nbsp;('key2', 'value2'),</code><br><code>&nbsp;&nbsp;&nbsp;&nbsp;('key3', 'value3'),</code><br><code>])</code></td><td><code>pairs = pipeline | beam.Create([</code><br><code>&nbsp;&nbsp;&nbsp;&nbsp;('key1', 'value1'),</code><br><code>&nbsp;&nbsp;&nbsp;&nbsp;('key2', 'value2'),</code><br><code>&nbsp;&nbsp;&nbsp;&nbsp;('key3', 'value3'),</code><br><code>])</code></td></tr><tr><td><b>Running a<br>local pipeline</b></td><td><code>$ spark-submit spark_pipeline.py</code></td><td><code>$ python beam_pipeline.py</code></td></tr></table></div></div><h2 id=transforms>Transforms</h2><p>Here are the equivalents of some common transforms in both PySpark and Beam.</p><div class=table-container-wrapper><div class=table-wrapper><table style=width:100% class=table-wrapper--equal-p><tr><th style=width:20%></th><th style=width:40%>PySpark</th><th style=width:40%>Beam</th></tr><tr><td><b><a href=/documentation/transforms/python/elementwise/map/>Map</a></b></td><td><code>values.map(lambda x: x * 2)</code></td><td><code>values | beam.Map(lambda x: x * 2)</code></td></tr><tr><td><b><a href=/documentation/transforms/python/elementwise/filter/>Filter</a></b></td><td><code>values.filter(lambda x: x % 2 == 0)</code></td><td><code>values | beam.Filter(lambda x: x % 2 == 0)</code></td></tr><tr><td><b><a href=/documentation/transforms/python/elementwise/flatmap/>FlatMap</a></b></td><td><code>values.flatMap(lambda x: range(x))</code></td><td><code>values | beam.FlatMap(lambda x: range(x))</code></td></tr><tr><td><b><a href=/documentation/transforms/python/aggregation/groupbykey/>Group by key</a></b></td><td><code>pairs.groupByKey()</code></td><td><code>pairs | beam.GroupByKey()</code></td></tr><tr><td><b><a href=/documentation/transforms/python/aggregation/combineglobally/>Reduce</a></b></td><td><code>values.reduce(lambda x, y: x+y)</code></td><td><code>values | beam.CombineGlobally(sum)</code></td></tr><tr><td><b><a href=/documentation/transforms/python/aggregation/combineperkey/>Reduce by key</a></b></td><td><code>pairs.reduceByKey(lambda x, y: x+y)</code></td><td><code>pairs | beam.CombinePerKey(sum)</code></td></tr><tr><td><b><a href=/documentation/transforms/python/aggregation/distinct/>Distinct</a></b></td><td><code>values.distinct()</code></td><td><code>values | beam.Distinct()</code></td></tr><tr><td><b><a href=/documentation/transforms/python/aggregation/count/>Count</a></b></td><td><code>values.count()</code></td><td><code>values | beam.combiners.Count.Globally()</code></td></tr><tr><td><b><a href=/documentation/transforms/python/aggregation/count/>Count by key</a></b></td><td><code>pairs.countByKey()</code></td><td><code>pairs | beam.combiners.Count.PerKey()</code></td></tr><tr><td><b><a href=/documentation/transforms/python/aggregation/top/>Take smallest</a></b></td><td><code>values.takeOrdered(3)</code></td><td><code>values | beam.combiners.Top.Smallest(3)</code></td></tr><tr><td><b><a href=/documentation/transforms/python/aggregation/top/>Take largest</a></b></td><td><code>values.takeOrdered(3, lambda x: -x)</code></td><td><code>values | beam.combiners.Top.Largest(3)</code></td></tr><tr><td><b><a href=/documentation/transforms/python/aggregation/sample/>Random sample</a></b></td><td><code>values.takeSample(False, 3)</code></td><td><code>values | beam.combiners.Sample.FixedSizeGlobally(3)</code></td></tr><tr><td><b><a href=/documentation/transforms/python/other/flatten/>Union</a></b></td><td><code>values.union(otherValues)</code></td><td><code>(values, otherValues) | beam.Flatten()</code></td></tr><tr><td><b><a href=/documentation/transforms/python/aggregation/cogroupbykey/>Co-group</a></b></td><td><code>pairs.cogroup(otherPairs)</code></td><td><code>{'Xs': pairs, 'Ys': otherPairs} | beam.CoGroupByKey()</code></td></tr></table></div></div><blockquote><p>ℹ️ To learn more about the transforms available in Beam, check the
<a href=/documentation/transforms/python/overview>Python transform gallery</a>.</p></blockquote><h2 id=using-calculated-values>Using calculated values</h2><p>Since we are working in potentially distributed environments,
we can&rsquo;t guarantee that the results we&rsquo;ve calculated are available at any given machine.</p><p>In PySpark, we can get a result from a collection of elements (RDD) by using
<code>data.collect()</code>, or other aggregations such as <code>reduce()</code>, <code>count()</code>, and more.</p><p>Here&rsquo;s an example to scale numbers into a range between zero and one.</p><div class='language-py snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=cl><span class=kn>import</span> <span class=nn>pyspark</span>
</span></span><span class=line><span class=cl>
</span></span><span class=line><span class=cl><span class=n>sc</span> <span class=o>=</span> <span class=n>pyspark</span><span class=o>.</span><span class=n>SparkContext</span><span class=p>()</span>
</span></span><span class=line><span class=cl><span class=n>values</span> <span class=o>=</span> <span class=n>sc</span><span class=o>.</span><span class=n>parallelize</span><span class=p>([</span><span class=mi>1</span><span class=p>,</span> <span class=mi>2</span><span class=p>,</span> <span class=mi>3</span><span class=p>,</span> <span class=mi>4</span><span class=p>])</span>
</span></span><span class=line><span class=cl><span class=n>min_value</span> <span class=o>=</span> <span class=n>values</span><span class=o>.</span><span class=n>reduce</span><span class=p>(</span><span class=nb>min</span><span class=p>)</span>
</span></span><span class=line><span class=cl><span class=n>max_value</span> <span class=o>=</span> <span class=n>values</span><span class=o>.</span><span class=n>reduce</span><span class=p>(</span><span class=nb>max</span><span class=p>)</span>
</span></span><span class=line><span class=cl>
</span></span><span class=line><span class=cl><span class=c1># We can simply use `min_value` and `max_value` since it&#39;s already a Python `int` value from `reduce`.</span>
</span></span><span class=line><span class=cl><span class=n>scaled_values</span> <span class=o>=</span> <span class=n>values</span><span class=o>.</span><span class=n>map</span><span class=p>(</span><span class=k>lambda</span> <span class=n>x</span><span class=p>:</span> <span class=p>(</span><span class=n>x</span> <span class=o>-</span> <span class=n>min_value</span><span class=p>)</span> <span class=o>/</span> <span class=p>(</span><span class=n>max_value</span> <span class=o>-</span> <span class=n>min_value</span><span class=p>))</span>
</span></span><span class=line><span class=cl>
</span></span><span class=line><span class=cl><span class=c1># But to access `scaled_values`, we need to call `collect`.</span>
</span></span><span class=line><span class=cl><span class=nb>print</span><span class=p>(</span><span class=n>scaled_values</span><span class=o>.</span><span class=n>collect</span><span class=p>())</span></span></span></code></pre></div></div></div><p>In Beam the results from all transforms result in a PCollection.
We use <a href=/documentation/programming-guide/#side-inputs><em>side inputs</em></a>
to feed a PCollection into a transform and access its values.</p><p>Any transform that accepts a function, like
<a href=/documentation/transforms/python/elementwise/map><code>Map</code></a>,
can take side inputs.
If we only need a single value, we can use
<a href=https://beam.apache.org/releases/pydoc/current/apache_beam.pvalue.html#apache_beam.pvalue.AsSingleton><code>beam.pvalue.AsSingleton</code></a> and access them as a Python value.
If we need multiple values, we can use
<a href=https://beam.apache.org/releases/pydoc/current/apache_beam.pvalue.html#apache_beam.pvalue.AsIter><code>beam.pvalue.AsIter</code></a>
and access them as an <a href=https://docs.python.org/3/glossary.html#term-iterable><code>iterable</code></a>.</p><div class='language-py snippet'><div class="notebook-skip code-snippet"><a class=copy type=button data-bs-toggle=tooltip data-bs-placement=bottom title="Copy to clipboard"><img src=/images/copy-icon.svg></a><div class=highlight><pre tabindex=0 class=chroma><code class=language-py data-lang=py><span class=line><span class=cl><span class=kn>import</span> <span class=nn>apache_beam</span> <span class=k>as</span> <span class=nn>beam</span>
</span></span><span class=line><span class=cl>
</span></span><span class=line><span class=cl><span class=k>with</span> <span class=n>beam</span><span class=o>.</span><span class=n>Pipeline</span><span class=p>()</span> <span class=k>as</span> <span class=n>pipeline</span><span class=p>:</span>
</span></span><span class=line><span class=cl> <span class=n>values</span> <span class=o>=</span> <span class=n>pipeline</span> <span class=o>|</span> <span class=n>beam</span><span class=o>.</span><span class=n>Create</span><span class=p>([</span><span class=mi>1</span><span class=p>,</span> <span class=mi>2</span><span class=p>,</span> <span class=mi>3</span><span class=p>,</span> <span class=mi>4</span><span class=p>])</span>
</span></span><span class=line><span class=cl> <span class=n>min_value</span> <span class=o>=</span> <span class=n>values</span> <span class=o>|</span> <span class=n>beam</span><span class=o>.</span><span class=n>CombineGlobally</span><span class=p>(</span><span class=nb>min</span><span class=p>)</span>
</span></span><span class=line><span class=cl> <span class=n>max_value</span> <span class=o>=</span> <span class=n>values</span> <span class=o>|</span> <span class=n>beam</span><span class=o>.</span><span class=n>CombineGlobally</span><span class=p>(</span><span class=nb>max</span><span class=p>)</span>
</span></span><span class=line><span class=cl>
</span></span><span class=line><span class=cl> <span class=c1># To access `min_value` and `max_value`, we need to pass them as a side input.</span>
</span></span><span class=line><span class=cl> <span class=n>scaled_values</span> <span class=o>=</span> <span class=n>values</span> <span class=o>|</span> <span class=n>beam</span><span class=o>.</span><span class=n>Map</span><span class=p>(</span>
</span></span><span class=line><span class=cl> <span class=k>lambda</span> <span class=n>x</span><span class=p>,</span> <span class=n>minimum</span><span class=p>,</span> <span class=n>maximum</span><span class=p>:</span> <span class=p>(</span><span class=n>x</span> <span class=o>-</span> <span class=n>minimum</span><span class=p>)</span> <span class=o>/</span> <span class=p>(</span><span class=n>maximum</span> <span class=o>-</span> <span class=n>minimum</span><span class=p>),</span>
</span></span><span class=line><span class=cl> <span class=n>minimum</span><span class=o>=</span><span class=n>beam</span><span class=o>.</span><span class=n>pvalue</span><span class=o>.</span><span class=n>AsSingleton</span><span class=p>(</span><span class=n>min_value</span><span class=p>),</span>
</span></span><span class=line><span class=cl> <span class=n>maximum</span><span class=o>=</span><span class=n>beam</span><span class=o>.</span><span class=n>pvalue</span><span class=o>.</span><span class=n>AsSingleton</span><span class=p>(</span><span class=n>max_value</span><span class=p>),</span>
</span></span><span class=line><span class=cl> <span class=p>)</span>
</span></span><span class=line><span class=cl>
</span></span><span class=line><span class=cl> <span class=n>scaled_values</span> <span class=o>|</span> <span class=n>beam</span><span class=o>.</span><span class=n>Map</span><span class=p>(</span><span class=nb>print</span><span class=p>)</span></span></span></code></pre></div></div></div><blockquote><p>ℹ️ In Beam we need to pass a side input explicitly, but we get the
benefit that a reduction or aggregation does <em>not</em> have to fit into memory.
Lazily computing side inputs also allows us to compute <code>values</code> only once,
rather than for each distinct reduction (or requiring explicit caching of the RDD).</p></blockquote><h2 id=next-steps>Next Steps</h2><ul><li>Take a look at all the available transforms in the <a href=/documentation/transforms/python/overview>Python transform gallery</a>.</li><li>Learn how to read from and write to files in the <a href=/documentation/programming-guide/#pipeline-io><em>Pipeline I/O</em> section of the <em>Programming guide</em></a></li><li>Walk through additional WordCount examples in the <a href=/get-started/wordcount-example>WordCount Example Walkthrough</a>.</li><li>Take a self-paced tour through our <a href=/documentation/resources/learning-resources>Learning Resources</a>.</li><li>Dive in to some of our favorite <a href=/get-started/resources/videos-and-podcasts>Videos and Podcasts</a>.</li><li>Join the Beam <a href=/community/contact-us>users@</a> mailing list.</li><li>If you&rsquo;re interested in contributing to the Apache Beam codebase, see the <a href=/contribute>Contribution Guide</a>.</li></ul><p>Please don&rsquo;t hesitate to <a href=/community/contact-us>reach out</a> if you encounter any issues!</p><div class=feedback><p class=update>Last updated on 2024/05/03</p><h3>Have you found everything you were looking for?</h3><p class=description>Was it all useful and clear? Is there anything that you would like to change? Let us know!</p><button class=load-button><a href="https://docs.google.com/forms/d/e/1FAIpQLSfID7abne3GE6k6RdJIyZhPz2Gef7UkpggUEhTIDjjplHuxSA/viewform?usp=header_link" target=_blank>SEND FEEDBACK</a></button></div></div></div><footer class=footer><div class=footer__contained><div class=footer__cols><div class="footer__cols__col footer__cols__col__logos"><div class=footer__cols__col__logo><img src=/images/beam_logo_circle.svg class=footer__logo alt="Beam logo"></div><div class=footer__cols__col__logo><img src=/images/apache_logo_circle.svg class=footer__logo alt="Apache logo"></div></div><div class=footer-wrapper><div class=wrapper-grid><div class=footer__cols__col><div class=footer__cols__col__title>Start</div><div class=footer__cols__col__link><a href=/get-started/beam-overview/>Overview</a></div><div class=footer__cols__col__link><a href=/get-started/quickstart-java/>Quickstart (Java)</a></div><div class=footer__cols__col__link><a href=/get-started/quickstart-py/>Quickstart (Python)</a></div><div class=footer__cols__col__link><a href=/get-started/quickstart-go/>Quickstart (Go)</a></div><div class=footer__cols__col__link><a href=/get-started/downloads/>Downloads</a></div></div><div class=footer__cols__col><div class=footer__cols__col__title>Docs</div><div class=footer__cols__col__link><a href=/documentation/programming-guide/>Concepts</a></div><div class=footer__cols__col__link><a href=/documentation/pipelines/design-your-pipeline/>Pipelines</a></div><div class=footer__cols__col__link><a href=/documentation/runners/capability-matrix/>Runners</a></div></div><div class=footer__cols__col><div class=footer__cols__col__title>Community</div><div class=footer__cols__col__link><a href=/contribute/>Contribute</a></div><div class=footer__cols__col__link><a href=https://projects.apache.org/committee.html?beam target=_blank>Team<img src=/images/external-link-icon.png width=14 height=14 alt="External link."></a></div><div class=footer__cols__col__link><a href=/community/presentation-materials/>Media</a></div><div class=footer__cols__col__link><a href=/community/in-person/>Events/Meetups</a></div><div class=footer__cols__col__link><a href=/community/contact-us/>Contact Us</a></div></div><div class=footer__cols__col><div class=footer__cols__col__title>Resources</div><div class=footer__cols__col__link><a href=/blog/>Blog</a></div><div class=footer__cols__col__link><a href=https://github.com/apache/beam>GitHub</a></div></div></div><div class=footer__bottom>&copy;
<a href=https://www.apache.org>The Apache Software Foundation</a>
| <a href=/privacy_policy>Privacy Policy</a>
| <a href=/feed.xml>RSS Feed</a><br><br>Apache Beam, Apache, Beam, the Beam logo, and the Apache feather logo are either registered trademarks or trademarks of The Apache Software Foundation. All other products or name brands are trademarks of their respective holders, including The Apache Software Foundation.</div></div><div class="footer__cols__col footer__cols__col__logos"><div class=footer__cols__col--group><div class=footer__cols__col__logo><a href=https://github.com/apache/beam><img src=/images/logos/social-icons/github-logo-150.png class=footer__logo alt="Github logo"></a></div><div class=footer__cols__col__logo><a href=https://www.linkedin.com/company/apache-beam/><img src=/images/logos/social-icons/linkedin-logo-150.png class=footer__logo alt="Linkedin logo"></a></div></div><div class=footer__cols__col--group><div class=footer__cols__col__logo><a href=https://twitter.com/apachebeam><img src=/images/logos/social-icons/twitter-logo-150.png class=footer__logo alt="Twitter logo"></a></div><div class=footer__cols__col__logo><a href=https://www.youtube.com/channel/UChNnb_YO_7B0HlW6FhAXZZQ><img src=/images/logos/social-icons/youtube-logo-150.png class=footer__logo alt="Youtube logo"></a></div></div></div></div></div></footer></body></html>