| |
| <!DOCTYPE html> |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8"> |
| <title>Apache Zeppelin 0.10.1 Documentation: Beam interpreter in Apache Zeppelin</title> |
| <meta name="description" content="Apache Beam is an open source, unified programming model that you can use to create a data processing pipeline."> |
| <meta name="author" content="The Apache Software Foundation"> |
| |
| <!-- Enable responsive viewport --> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| |
| <!-- Le HTML5 shim, for IE6-8 support of HTML elements --> |
| <!--[if lt IE 9]> |
| <script src="http://html5shim.googlecode.com/svn/trunk/html5.js"></script> |
| <![endif]--> |
| |
| <link href="/docs/0.10.1/assets/themes/zeppelin/font-awesome.min.css" rel="stylesheet"> |
| |
| <!-- Le styles --> |
| <link href="/docs/0.10.1/assets/themes/zeppelin/bootstrap/css/bootstrap.css" rel="stylesheet"> |
| <link href="/docs/0.10.1/assets/themes/zeppelin/css/style.css?body=1" rel="stylesheet" type="text/css"> |
| <link href="/docs/0.10.1/assets/themes/zeppelin/css/syntax.css" rel="stylesheet" type="text/css" media="screen" /> |
| <!-- Le fav and touch icons --> |
| <!-- Update these with your own images |
| <link rel="shortcut icon" href="images/favicon.ico"> |
| <link rel="apple-touch-icon" href="images/apple-touch-icon.png"> |
| <link rel="apple-touch-icon" sizes="72x72" href="images/apple-touch-icon-72x72.png"> |
| <link rel="apple-touch-icon" sizes="114x114" href="images/apple-touch-icon-114x114.png"> |
| --> |
| |
| <!-- Js --> |
| <script src="/docs/0.10.1/assets/themes/zeppelin/jquery-1.10.2.min.js"></script> |
| <script src="/docs/0.10.1/assets/themes/zeppelin/bootstrap/js/bootstrap.min.js"></script> |
| <script src="/docs/0.10.1/assets/themes/zeppelin/js/docs.js"></script> |
| <script src="/docs/0.10.1/assets/themes/zeppelin/js/anchor.min.js"></script> |
| <script src="/docs/0.10.1/assets/themes/zeppelin/js/toc.js"></script> |
| <script src="/docs/0.10.1/assets/themes/zeppelin/js/lunr.min.js"></script> |
| <script src="/docs/0.10.1/assets/themes/zeppelin/js/search.js"></script> |
| |
| <!-- atom & rss feed --> |
| <link href="/docs/0.10.1/atom.xml" type="application/atom+xml" rel="alternate" title="Sitewide ATOM Feed"> |
| <link href="/docs/0.10.1/rss.xml" type="application/rss+xml" rel="alternate" title="Sitewide RSS Feed"> |
| |
| <!-- Matomo --> |
| <script> |
| var _paq = window._paq = window._paq || []; |
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ |
| _paq.push(["setDoNotTrack", true]); |
| _paq.push(["disableCookies"]); |
| _paq.push(['trackPageView']); |
| _paq.push(['enableLinkTracking']); |
| (function() { |
| var u="https://analytics.apache.org/"; |
| _paq.push(['setTrackerUrl', u+'matomo.php']); |
| _paq.push(['setSiteId', '69']); |
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; |
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| </head> |
| |
| <body> |
| |
| <div id="menu" class="navbar navbar-inverse navbar-fixed-top" role="navigation"> |
| <div class="container navbar-container"> |
| <div class="navbar-header"> |
| <button type="button" class="navbar-toggle" data-toggle="collapse" data-target=".navbar-collapse"> |
| <span class="sr-only">Toggle navigation</span> |
| <span class="icon-bar"></span> |
| <span class="icon-bar"></span> |
| <span class="icon-bar"></span> |
| </button> |
| <div class="navbar-brand"> |
| <a class="navbar-brand-main" href="http://zeppelin.apache.org"> |
| <img src="/docs/0.10.1/assets/themes/zeppelin/img/zeppelin_logo.png" width="50" |
| style="margin-top: -2px;" alt="I'm zeppelin"> |
| <span style="margin-left: 5px; font-size: 27px;">Zeppelin</span> |
| </a> |
| <a class="navbar-brand-version" href="/docs/0.10.1" |
| style="font-size: 15px; color: white;"> 0.10.1 |
| </a> |
| </div> |
| </div> |
| <nav class="navbar-collapse collapse" role="navigation"> |
| <ul class="nav navbar-nav"> |
| <li> |
| <a href="#" data-toggle="dropdown" class="dropdown-toggle">Quick Start <b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li class="title"><span>Getting Started</span></li> |
| <li><a href="/docs/0.10.1/quickstart/install.html">Install</a></li> |
| <li><a href="/docs/0.10.1/quickstart/explore_ui.html">Explore UI</a></li> |
| <li><a href="/docs/0.10.1/quickstart/tutorial.html">Tutorial</a></li> |
| <li role="separator" class="divider"></li> |
| <li class="title"><span>Run Mode</span></li> |
| <li><a href="/docs/0.10.1/quickstart/kubernetes.html">Kubernetes</a></li> |
| <li><a href="/docs/0.10.1/quickstart/docker.html">Docker</a></li> |
| <li><a href="/docs/0.10.1/quickstart/yarn.html">Yarn</a></li> |
| <li role="separator" class="divider"></li> |
| <li><a href="/docs/0.10.1/quickstart/spark_with_zeppelin.html">Spark with Zeppelin</a></li> |
| <li><a href="/docs/0.10.1/quickstart/flink_with_zeppelin.html">Flink with Zeppelin</a></li> |
| <li><a href="/docs/0.10.1/quickstart/sql_with_zeppelin.html">SQL with Zeppelin</a></li> |
| <li><a href="/docs/0.10.1/quickstart/python_with_zeppelin.html">Python with Zeppelin</a></li> |
| <li><a href="/docs/0.10.1/quickstart/r_with_zeppelin.html">R with Zeppelin</a></li> |
| </ul> |
| </li> |
| |
| <li> |
| <a href="#" data-toggle="dropdown" class="dropdown-toggle">Usage<b class="caret"></b></a> |
| <ul class="dropdown-menu scrollable-menu"> |
| <li class="title"><span>Dynamic Form</span></li> |
| <li><a href="/docs/0.10.1/usage/dynamic_form/intro.html">What is Dynamic Form?</a></li> |
| <li role="separator" class="divider"></li> |
| <li class="title"><span>Display System</span></li> |
| <li><a href="/docs/0.10.1/usage/display_system/basic.html#text">Text Display</a></li> |
| <li><a href="/docs/0.10.1/usage/display_system/basic.html#html">HTML Display</a></li> |
| <li><a href="/docs/0.10.1/usage/display_system/basic.html#table">Table Display</a></li> |
| <li><a href="/docs/0.10.1/usage/display_system/basic.html#network">Network Display</a></li> |
| <li><a href="/docs/0.10.1/usage/display_system/angular_backend.html">Angular Display using Backend API</a></li> |
| <li><a href="/docs/0.10.1/usage/display_system/angular_frontend.html">Angular Display using Frontend API</a></li> |
| <li role="separator" class="divider"></li> |
| <li class="title"><span>Interpreter</span></li> |
| <li><a href="/docs/0.10.1/usage/interpreter/overview.html">Overview</a></li> |
| <li><a href="/docs/0.10.1/usage/interpreter/interpreter_binding_mode.html">Interpreter Binding Mode</a></li> |
| <li><a href="/docs/0.10.1/usage/interpreter/user_impersonation.html">User Impersonation</a></li> |
| <li><a href="/docs/0.10.1/usage/interpreter/dependency_management.html">Dependency Management</a></li> |
| <li><a href="/docs/0.10.1/usage/interpreter/installation.html">Installing Interpreters</a></li> |
| <!--<li><a href="/docs/0.10.1/usage/interpreter/dynamic_loading.html">Dynamic Interpreter Loading (Experimental)</a></li>--> |
| <li><a href="/docs/0.10.1/usage/interpreter/execution_hooks.html">Execution Hooks (Experimental)</a></li> |
| <li role="separator" class="divider"></li> |
| <li class="title"><span>Other Features</span></li> |
| <li><a href="/docs/0.10.1/usage/other_features/publishing_paragraphs.html">Publishing Paragraphs</a></li> |
| <li><a href="/docs/0.10.1/usage/other_features/personalized_mode.html">Personalized Mode</a></li> |
| <li><a href="/docs/0.10.1/usage/other_features/customizing_homepage.html">Customizing Zeppelin Homepage</a></li> |
| <li><a href="/docs/0.10.1/usage/other_features/notebook_actions.html">Notebook Actions</a></li> |
| <li><a href="/docs/0.10.1/usage/other_features/cron_scheduler.html">Cron Scheduler</a></li> |
| <li><a href="/docs/0.10.1/usage/other_features/zeppelin_context.html">Zeppelin Context</a></li> |
| <li role="separator" class="divider"></li> |
| <li class="title"><span>REST API</span></li> |
| <li><a href="/docs/0.10.1/usage/rest_api/interpreter.html">Interpreter API</a></li> |
| <li><a href="/docs/0.10.1/usage/rest_api/zeppelin_server.html">Zeppelin Server API</a></li> |
| <li><a href="/docs/0.10.1/usage/rest_api/notebook.html">Notebook API</a></li> |
| <li><a href="/docs/0.10.1/usage/rest_api/notebook_repository.html">Notebook Repository API</a></li> |
| <li><a href="/docs/0.10.1/usage/rest_api/configuration.html">Configuration API</a></li> |
| <li><a href="/docs/0.10.1/usage/rest_api/credential.html">Credential API</a></li> |
| <li><a href="/docs/0.10.1/usage/rest_api/helium.html">Helium API</a></li> |
| <li class="title"><span>Zeppelin SDK</span></li> |
| <li><a href="/docs/0.10.1/usage/zeppelin_sdk/client_api.html">Client API</a></li> |
| <li><a href="/docs/0.10.1/usage/zeppelin_sdk/session_api.html">Session API</a></li> |
| </ul> |
| </li> |
| |
| <li> |
| <a href="#" data-toggle="dropdown" class="dropdown-toggle">Setup<b class="caret"></b></a> |
| <ul class="dropdown-menu scrollable-menu"> |
| <li class="title"><span>Basics</span></li> |
| <li><a href="/docs/0.10.1/setup/basics/how_to_build.html">How to Build Zeppelin</a></li> |
| <li><a href="/docs/0.10.1/setup/basics/hadoop_integration.html">Hadoop Integration</a></li> |
| <li><a href="/docs/0.10.1/setup/basics/multi_user_support.html">Multi-user Support</a></li> |
| <li role="separator" class="divider"></li> |
| <li class="title"><span>Deployment</span></li> |
| <!--<li><a href="/docs/0.10.1/setup/deployment/docker.html">Docker Image for Zeppelin</a></li>--> |
| <li><a href="/docs/0.10.1/setup/deployment/spark_cluster_mode.html#spark-standalone-mode">Spark Cluster Mode: Standalone</a></li> |
| <li><a href="/docs/0.10.1/setup/deployment/spark_cluster_mode.html#spark-on-yarn-mode">Spark Cluster Mode: YARN</a></li> |
| <li><a href="/docs/0.10.1/setup/deployment/spark_cluster_mode.html#spark-on-mesos-mode">Spark Cluster Mode: Mesos</a></li> |
| <li><a href="/docs/0.10.1/setup/deployment/flink_and_spark_cluster.html">Zeppelin with Flink, Spark Cluster</a></li> |
| <li><a href="/docs/0.10.1/setup/deployment/cdh.html">Zeppelin on CDH</a></li> |
| <li><a href="/docs/0.10.1/setup/deployment/virtual_machine.html">Zeppelin on VM: Vagrant</a></li> |
| <li role="separator" class="divider"></li> |
| <li class="title"><span>Security</span></li> |
| <li><a href="/docs/0.10.1/setup/security/authentication_nginx.html">HTTP Basic Auth using NGINX</a></li> |
| <li><a href="/docs/0.10.1/setup/security/shiro_authentication.html">Shiro Authentication</a></li> |
| <li><a href="/docs/0.10.1/setup/security/notebook_authorization.html">Notebook Authorization</a></li> |
| <li><a href="/docs/0.10.1/setup/security/datasource_authorization.html">Data Source Authorization</a></li> |
| <li><a href="/docs/0.10.1/setup/security/http_security_headers.html">HTTP Security Headers</a></li> |
| <li role="separator" class="divider"></li> |
| <li class="title"><span>Notebook Storage</span></li> |
| <li><a href="/docs/0.10.1/setup/storage/storage.html#notebook-storage-in-local-git-repository">Git Storage</a></li> |
| <li><a href="/docs/0.10.1/setup/storage/storage.html#notebook-storage-in-s3">S3 Storage</a></li> |
| <li><a href="/docs/0.10.1/setup/storage/storage.html#notebook-storage-in-azure">Azure Storage</a></li> |
| <li><a href="/docs/0.10.1/setup/storage/storage.html#notebook-storage-in-oss">OSS Storage</a></li> |
| <li><a href="/docs/0.10.1/setup/storage/storage.html#notebook-storage-in-zeppelinhub">ZeppelinHub Storage</a></li> |
| <li><a href="/docs/0.10.1/setup/storage/storage.html#notebook-storage-in-mongodb">MongoDB Storage</a></li> |
| <li role="separator" class="divider"></li> |
| <li class="title"><span>Operation</span></li> |
| <li><a href="/docs/0.10.1/setup/operation/configuration.html">Configuration</a></li> |
| <li><a href="/docs/0.10.1/setup/operation/proxy_setting.html">Proxy Setting</a></li> |
| <li><a href="/docs/0.10.1/setup/operation/upgrading.html">Upgrading</a></li> |
| <li><a href="/docs/0.10.1/setup/operation/trouble_shooting.html">Trouble Shooting</a></li> |
| </ul> |
| </li> |
| |
| <li> |
| <a href="#" data-toggle="dropdown" class="dropdown-toggle">Interpreter <b class="caret"></b></a> |
| <ul class="dropdown-menu scrollable-menu"> |
| <li class="title"><span>Interpreters</span></li> |
| <li><a href="/docs/0.10.1/usage/interpreter/overview.html">Overview</a></li> |
| <li role="separator" class="divider"></li> |
| <li><a href="/docs/0.10.1/interpreter/spark.html">Spark</a></li> |
| <li><a href="/docs/0.10.1/interpreter/flink.html">Flink</a></li> |
| <li><a href="/docs/0.10.1/interpreter/jdbc.html">JDBC</a></li> |
| <li><a href="/docs/0.10.1/interpreter/python.html">Python</a></li> |
| <li><a href="/docs/0.10.1/interpreter/r.html">R</a></li> |
| <li role="separator" class="divider"></li> |
| <li><a href="/docs/0.10.1/interpreter/alluxio.html">Alluxio</a></li> |
| <li><a href="/docs/0.10.1/interpreter/beam.html">Beam</a></li> |
| <li><a href="/docs/0.10.1/interpreter/bigquery.html">BigQuery</a></li> |
| <li><a href="/docs/0.10.1/interpreter/cassandra.html">Cassandra</a></li> |
| <li><a href="/docs/0.10.1/interpreter/elasticsearch.html">Elasticsearch</a></li> |
| <li><a href="/docs/0.10.1/interpreter/geode.html">Geode</a></li> |
| <li><a href="/docs/0.10.1/interpreter/groovy.html">Groovy</a></li> |
| <li><a href="/docs/0.10.1/interpreter/hazelcastjet.html">Hazelcast Jet</a></li> |
| <li><a href="/docs/0.10.1/interpreter/hbase.html">HBase</a></li> |
| <li><a href="/docs/0.10.1/interpreter/hdfs.html">HDFS</a></li> |
| <li><a href="/docs/0.10.1/interpreter/hive.html">Hive</a></li> |
| <li><a href="/docs/0.10.1/interpreter/ignite.html">Ignite</a></li> |
| <li><a href="/docs/0.10.1/interpreter/influxdb.html">influxDB</a></li> |
| <li><a href="/docs/0.10.1/interpreter/java.html">Java</a></li> |
| <li><a href="/docs/0.10.1/interpreter/jupyter.html">Jupyter</a></li> |
| <li><a href="/docs/0.10.1/interpreter/kotlin.html">Kotlin</a></li> |
| <li><a href="/docs/0.10.1/interpreter/ksql.html">KSQL</a></li> |
| <li><a href="/docs/0.10.1/interpreter/kylin.html">Kylin</a></li> |
| <li><a href="/docs/0.10.1/interpreter/lens.html">Lens</a></li> |
| <li><a href="/docs/0.10.1/interpreter/livy.html">Livy</a></li> |
| <li><a href="/docs/0.10.1/interpreter/mahout.html">Mahout</a></li> |
| <li><a href="/docs/0.10.1/interpreter/markdown.html">Markdown</a></li> |
| <li><a href="/docs/0.10.1/interpreter/mongodb.html">MongoDB</a></li> |
| <li><a href="/docs/0.10.1/interpreter/neo4j.html">Neo4j</a></li> |
| <li><a href="/docs/0.10.1/interpreter/pig.html">Pig</a></li> |
| <li><a href="/docs/0.10.1/interpreter/postgresql.html">Postgresql, HAWQ</a></li> |
| <li><a href="/docs/0.10.1/interpreter/sap.html">SAP</a></li> |
| <li><a href="/docs/0.10.1/interpreter/scalding.html">Scalding</a></li> |
| <li><a href="/docs/0.10.1/interpreter/scio.html">Scio</a></li> |
| <li><a href="/docs/0.10.1/interpreter/shell.html">Shell</a></li> |
| <li><a href="/docs/0.10.1/interpreter/sparql.html">Sparql</a></li> |
| <li><a href="/docs/0.10.1/interpreter/submarine.html">Submarine</a></li> |
| </ul> |
| </li> |
| <li> |
| <a href="#" data-toggle="dropdown" class="dropdown-toggle">More<b class="caret"></b></a> |
| <ul class="dropdown-menu scrollable-menu" style="right: 0; left: auto;"> |
| <li class="title"><span>Extending Zeppelin</span></li> |
| <li><a href="/docs/0.10.1/development/writing_zeppelin_interpreter.html">Writing Zeppelin Interpreter</a></li> |
| <li role="separator" class="divider"></li> |
| <li class="title"><span>Helium (Experimental)</span></li> |
| <li><a href="/docs/0.10.1/development/helium/overview.html">Overview</a></li> |
| <li><a href="/docs/0.10.1/development/helium/writing_application.html">Writing Helium Application</a></li> |
| <li><a href="/docs/0.10.1/development/helium/writing_spell.html">Writing Helium Spell</a></li> |
| <li><a href="/docs/0.10.1/development/helium/writing_visualization_basic.html">Writing Helium Visualization: Basics</a></li> |
| <li><a href="/docs/0.10.1/development/helium/writing_visualization_transformation.html">Writing Helium Visualization: Transformation</a></li> |
| <li role="separator" class="divider"></li> |
| <li class="title"><span>Contributing to Zeppelin</span></li> |
| <li><a href="/docs/0.10.1/setup/basics/how_to_build.html">How to Build Zeppelin</a></li> |
| <li><a href="/docs/0.10.1/development/contribution/useful_developer_tools.html">Useful Developer Tools</a></li> |
| <li><a href="/docs/0.10.1/development/contribution/how_to_contribute_code.html">How to Contribute (code)</a></li> |
| <li><a href="/docs/0.10.1/development/contribution/how_to_contribute_website.html">How to Contribute (website)</a></li> |
| <li role="separator" class="divider"></li> |
| <li class="title"><span>External Resources</span></li> |
| <li><a target="_blank" rel="noopener noreferrer" href="https://zeppelin.apache.org/community.html">Mailing List</a></li> |
| <li><a target="_blank" rel="noopener noreferrer" href="https://cwiki.apache.org/confluence/display/ZEPPELIN/Zeppelin+Home">Apache Zeppelin Wiki</a></li> |
| <li><a target="_blank" rel="noopener noreferrer" href="http://stackoverflow.com/questions/tagged/apache-zeppelin">Stackoverflow Questions about Zeppelin</a></li> |
| </ul> |
| </li> |
| <li> |
| <a href="/docs/0.10.1/search.html" class="nav-search-link"> |
| <span class="fa fa-search nav-search-icon"></span> |
| </a> |
| </li> |
| </ul> |
| </nav><!--/.navbar-collapse --> |
| </div> |
| </div> |
| |
| |
| |
| <div class="content"> |
| |
| <!--<div class="hero-unit Beam interpreter in Apache Zeppelin"> |
| <h1></h1> |
| </div> |
| --> |
| |
| <div class="row"> |
| <div class="col-md-12"> |
| <!-- |
| Licensed under the Apache License, Version 2.0 (the "License"); |
| you may not use this file except in compliance with the License. |
| You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| --> |
| |
| <h1>Beam interpreter for Apache Zeppelin</h1> |
| |
| <div id="toc"></div> |
| |
| <h2>Overview</h2> |
| |
| <p><a href="http://beam.incubator.apache.org">Apache Beam</a> is an open source unified platform for data processing pipelines. A pipeline can be built using one of the Beam SDKs. |
| The execution of the pipeline is done by different Runners. Currently, Beam supports Apache Flink Runner, Apache Spark Runner, and Google Dataflow Runner.</p> |
| |
| <h2>How to use</h2> |
| |
| <p>Basically, you can write normal Beam Java code where you can specify the Runner. You should write the main method inside a class because the interpreter invokes this main method to execute the pipeline. Unlike the normal Zeppelin pattern, each paragraph is considered a separate job; there isn't any relation to any other paragraph.</p> |
| |
| <p>The following is a demonstration of a word count example with data represented as an array of strings. |
| It can also read data from files by replacing <code>Create.of(SENTENCES).withCoder(StringUtf8Coder.of())</code> with <code>TextIO.Read.from("path/to/filename.txt")</code>.</p> |
| <div class="highlight"><pre><code class="java language-java" data-lang="java"><span class="o">%</span><span class="n">beam</span> |
| |
| <span class="c1">// most used imports</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.sdk.coders.StringUtf8Coder</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.sdk.transforms.Create</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.io.Serializable</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.Arrays</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.ArrayList</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.runners.direct.*</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.sdk.runners.*</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.sdk.options.*</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.runners.flink.*</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.sdk.Pipeline</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.sdk.io.TextIO</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.sdk.options.PipelineOptionsFactory</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.sdk.transforms.Count</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.sdk.transforms.DoFn</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.sdk.transforms.MapElements</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.sdk.transforms.ParDo</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.sdk.transforms.SimpleFunction</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.sdk.values.KV</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.beam.sdk.options.PipelineOptions</span><span class="o">;</span> |
| |
| <span class="kd">public</span> <span class="kd">class</span> <span class="nc">MinimalWordCount</span> <span class="o">{</span> |
| <span class="kd">static</span> <span class="n">List</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">s</span> <span class="o">=</span> <span class="k">new</span> <span class="n">ArrayList</span><span class="o">&lt;&gt;();</span> |
| |
| <span class="kd">static</span> <span class="kd">final</span> <span class="n">String</span><span class="o">[]</span> <span class="n">SENTENCES_ARRAY</span> <span class="o">=</span> <span class="k">new</span> <span class="n">String</span><span class="o">[]</span> <span class="o">{</span> |
| <span class="s">"Hadoop is the Elephant King!"</span><span class="o">,</span> |
| <span class="s">"A yellow and elegant thing."</span><span class="o">,</span> |
| <span class="s">"He never forgets"</span><span class="o">,</span> |
| <span class="s">"Useful data, or lets"</span><span class="o">,</span> |
| <span class="s">"An extraneous element cling!"</span><span class="o">,</span> |
| <span class="s">"A wonderful king is Hadoop."</span><span class="o">,</span> |
| <span class="s">"The elephant plays well with Sqoop."</span><span class="o">,</span> |
| <span class="s">"But what helps him to thrive"</span><span class="o">,</span> |
| <span class="s">"Are Impala, and Hive,"</span><span class="o">,</span> |
| <span class="s">"And HDFS in the group."</span><span class="o">,</span> |
| <span class="s">"Hadoop is an elegant fellow."</span><span class="o">,</span> |
| <span class="s">"An elephant gentle and mellow."</span><span class="o">,</span> |
| <span class="s">"He never gets mad,"</span><span class="o">,</span> |
| <span class="s">"Or does anything bad,"</span><span class="o">,</span> |
| <span class="s">"Because, at his core, he is yellow"</span><span class="o">,</span> |
| <span class="o">};</span> |
| <span class="kd">static</span> <span class="kd">final</span> <span class="n">List</span><span class="o">&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">SENTENCES</span> <span class="o">=</span> <span class="n">Arrays</span><span class="o">.</span><span class="na">asList</span><span class="o">(</span><span class="n">SENTENCES_ARRAY</span><span class="o">);</span> |
| <span class="kd">public</span> <span class="kd">static</span> <span class="kt">void</span> <span class="nf">main</span><span class="o">(</span><span class="n">String</span><span class="o">[]</span> <span class="n">args</span><span class="o">)</span> <span class="o">{</span> |
| <span class="n">PipelineOptions</span> <span class="n">options</span> <span class="o">=</span> <span class="n">PipelineOptionsFactory</span><span class="o">.</span><span class="na">create</span><span class="o">().</span><span class="na">as</span><span class="o">(</span><span class="n">PipelineOptions</span><span class="o">.</span><span class="na">class</span><span class="o">);</span> |
| <span class="n">options</span><span class="o">.</span><span class="na">setRunner</span><span class="o">(</span><span class="n">FlinkRunner</span><span class="o">.</span><span class="na">class</span><span class="o">);</span> |
| <span class="n">Pipeline</span> <span class="n">p</span> <span class="o">=</span> <span class="n">Pipeline</span><span class="o">.</span><span class="na">create</span><span class="o">(</span><span class="n">options</span><span class="o">);</span> |
| <span class="n">p</span><span class="o">.</span><span class="na">apply</span><span class="o">(</span><span class="n">Create</span><span class="o">.</span><span class="na">of</span><span class="o">(</span><span class="n">SENTENCES</span><span class="o">).</span><span class="na">withCoder</span><span class="o">(</span><span class="n">StringUtf8Coder</span><span class="o">.</span><span class="na">of</span><span class="o">()))</span> |
| <span class="o">.</span><span class="na">apply</span><span class="o">(</span><span class="s">"ExtractWords"</span><span class="o">,</span> <span class="n">ParDo</span><span class="o">.</span><span class="na">of</span><span class="o">(</span><span class="k">new</span> <span class="n">DoFn</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">String</span><span class="o">&gt;()</span> <span class="o">{</span> |
| <span class="nd">@ProcessElement</span> |
| <span class="kd">public</span> <span class="kt">void</span> <span class="nf">processElement</span><span class="o">(</span><span class="n">ProcessContext</span> <span class="n">c</span><span class="o">)</span> <span class="o">{</span> |
| <span class="k">for</span> <span class="o">(</span><span class="n">String</span> <span class="n">word</span> <span class="o">:</span> <span class="n">c</span><span class="o">.</span><span class="na">element</span><span class="o">().</span><span class="na">split</span><span class="o">(</span><span class="s">"[^a-zA-Z']+"</span><span class="o">))</span> <span class="o">{</span> |
| <span class="k">if</span> <span class="o">(!</span><span class="n">word</span><span class="o">.</span><span class="na">isEmpty</span><span class="o">())</span> <span class="o">{</span> |
| <span class="n">c</span><span class="o">.</span><span class="na">output</span><span class="o">(</span><span class="n">word</span><span class="o">);</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| <span class="o">}))</span> |
| <span class="o">.</span><span class="na">apply</span><span class="o">(</span><span class="n">Count</span><span class="o">.&lt;</span><span class="n">String</span><span class="o">&gt;</span> <span class="n">perElement</span><span class="o">())</span> |
| <span class="o">.</span><span class="na">apply</span><span class="o">(</span><span class="s">"FormatResults"</span><span class="o">,</span> <span class="n">ParDo</span><span class="o">.</span><span class="na">of</span><span class="o">(</span><span class="k">new</span> <span class="n">DoFn</span><span class="o">&lt;</span><span class="n">KV</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Long</span><span class="o">&gt;,</span> <span class="n">String</span><span class="o">&gt;()</span> <span class="o">{</span> |
| <span class="nd">@ProcessElement</span> |
| <span class="kd">public</span> <span class="kt">void</span> <span class="nf">processElement</span><span class="o">(</span><span class="n">DoFn</span><span class="o">&lt;</span><span class="n">KV</span><span class="o">&lt;</span><span class="n">String</span><span class="o">,</span> <span class="n">Long</span><span class="o">&gt;,</span> <span class="n">String</span><span class="o">&gt;.</span><span class="na">ProcessContext</span> <span class="n">arg0</span><span class="o">)</span> |
| <span class="kd">throws</span> <span class="n">Exception</span> <span class="o">{</span> |
| <span class="n">s</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="s">"\n"</span> <span class="o">+</span> <span class="n">arg0</span><span class="o">.</span><span class="na">element</span><span class="o">().</span><span class="na">getKey</span><span class="o">()</span> <span class="o">+</span> <span class="s">"\t"</span> <span class="o">+</span> <span class="n">arg0</span><span class="o">.</span><span class="na">element</span><span class="o">().</span><span class="na">getValue</span><span class="o">());</span> |
| <span class="o">}</span> |
| <span class="o">}));</span> |
| <span class="n">p</span><span class="o">.</span><span class="na">run</span><span class="o">();</span> |
| <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">"%table word\tcount"</span><span class="o">);</span> |
| <span class="k">for</span> <span class="o">(</span><span class="kt">int</span> <span class="n">i</span> <span class="o">=</span> <span class="mi">0</span><span class="o">;</span> <span class="n">i</span> <span class="o">&lt;</span> <span class="n">s</span><span class="o">.</span><span class="na">size</span><span class="o">();</span> <span class="n">i</span><span class="o">++)</span> <span class="o">{</span> |
| <span class="n">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">print</span><span class="o">(</span><span class="n">s</span><span class="o">.</span><span class="na">get</span><span class="o">(</span><span class="n">i</span><span class="o">));</span> |
| <span class="o">}</span> |
| |
| <span class="o">}</span> |
| <span class="o">}</span> |
| </code></pre></div> |
| </div> |
| </div> |
| |
| |
| <hr> |
| <footer> |
| <!-- <p>© 2022 The Apache Software Foundation</p>--> |
| </footer> |
| </div> |
| |
| |
| |
| |
| |
| |
| |
| </body> |
| </html> |
| |