| <!-- |
| Licensed to the Apache Software Foundation (ASF) under one |
| or more contributor license agreements. See the NOTICE file |
| distributed with this work for additional information |
| regarding copyright ownership. The ASF licenses this file |
| to you under the Apache License, Version 2.0 (the |
| "License"); you may not use this file except in compliance |
| with the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, |
| software distributed under the License is distributed on an |
| "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| KIND, either express or implied. See the License for the |
| specific language governing permissions and limitations |
| under the License. |
| --> |
| <!DOCTYPE html> |
| |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge"> |
| <meta name="viewport" content="width=device-width, initial-scale=1"> |
| <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags --> |
| |
| <title>Apache Flink 0.9.0 Documentation: Quick Start: Run K-Means Example</title> |
| |
| <link rel="shortcut icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon"> |
| <link rel="icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon"> |
| |
| <!-- Bootstrap --> |
| <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css"> |
| <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/flink.css"> |
| <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/syntax.css"> |
| <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/codetabs.css"> |
| |
| <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries --> |
| <!-- WARNING: Respond.js doesn't work if you view the page via file:// --> |
| <!--[if lt IE 9]> |
| <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script> |
| <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script> |
| <![endif]--> |
| </head> |
| <body> |
| |
| |
| |
| |
| |
| |
| <!-- Top navbar. --> |
| <nav class="navbar navbar-default navbar-fixed-top"> |
| <div class="container"> |
| <!-- The logo. --> |
| <div class="navbar-header"> |
| <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1"> |
| <span class="icon-bar"></span> |
| <span class="icon-bar"></span> |
| <span class="icon-bar"></span> |
| </button> |
| <div class="navbar-logo"> |
| <a href="http://flink.apache.org"><img alt="Apache Flink" src="http://flink.apache.org/docs/0.9/page/img/navbar-brand-logo.jpg"></a> |
| </div> |
| </div><!-- /.navbar-header --> |
| |
| <!-- The navigation links. --> |
| <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1"> |
| <ul class="nav navbar-nav"> |
| <li><a href="http://flink.apache.org/docs/0.9/index.html">Overview<span class="hidden-sm hidden-xs"> 0.9.0</span></a></li> |
| |
| <!-- Setup --> |
| <li class="dropdown"> |
| <a href="http://flink.apache.org/docs/0.9/setup" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Setup <span class="caret"></span></a> |
| <ul class="dropdown-menu" role="menu"> |
| <li><a href="http://flink.apache.org/docs/0.9/setup/building.html">Get Flink 0.9-SNAPSHOT</a></li> |
| |
| <li class="divider"></li> |
| <li role="presentation" class="dropdown-header"><strong>Deployment</strong></li> |
| <li><a href="http://flink.apache.org/docs/0.9/setup/local_setup.html" class="active">Local</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/setup/cluster_setup.html">Cluster (Standalone)</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/setup/yarn_setup.html">YARN</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/setup/gce_setup.html">GCloud</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/setup/flink_on_tez.html">Flink on Tez <span class="badge">Beta</span></a></li> |
| |
| <li class="divider"></li> |
| <li><a href="http://flink.apache.org/docs/0.9/setup/config.html">Configuration</a></li> |
| </ul> |
| </li> |
| |
| <!-- Programming Guides --> |
| <li class="dropdown"> |
| <a href="http://flink.apache.org/docs/0.9/apis" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Programming Guides <span class="caret"></span></a> |
| <ul class="dropdown-menu" role="menu"> |
| <li><a href="http://flink.apache.org/docs/0.9/apis/programming_guide.html"><strong>Batch: DataSet API</strong></a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/apis/streaming_guide.html"><strong>Streaming: DataStream API</strong> <span class="badge">Beta</span></a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/apis/python.html">Python API <span class="badge">Beta</span></a></li> |
| |
| <li class="divider"></li> |
| <li><a href="scala_shell.html">Interactive Scala Shell</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/apis/dataset_transformations.html">Dataset Transformations</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/apis/best_practices.html">Best Practices</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/apis/example_connectors.html">Connectors</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/apis/examples.html">Examples</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/apis/local_execution.html">Local Execution</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/apis/cluster_execution.html">Cluster Execution</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/apis/cli.html">Command Line Interface</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/apis/web_client.html">Web Client</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/apis/iterations.html">Iterations</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/apis/java8.html">Java 8</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/apis/hadoop_compatibility.html">Hadoop Compatibility <span class="badge">Beta</span></a></li> |
| </ul> |
| </li> |
| |
| <!-- Libraries --> |
| <li class="dropdown"> |
| <a href="http://flink.apache.org/docs/0.9/libs" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Libraries <span class="caret"></span></a> |
| <ul class="dropdown-menu" role="menu"> |
| <li><a href="http://flink.apache.org/docs/0.9/libs/spargel_guide.html">Graphs: Spargel</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/libs/gelly_guide.html">Graphs: Gelly <span class="badge">Beta</span></a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/libs/ml/">Machine Learning <span class="badge">Beta</span></a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/libs/table.html">Relational: Table <span class="badge">Beta</span></a></li> |
| </ul> |
| </li> |
| |
| <!-- Internals --> |
| <li class="dropdown"> |
| <a href="http://flink.apache.org/docs/0.9/internals" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Internals <span class="caret"></span></a> |
| <ul class="dropdown-menu" role="menu"> |
| <li role="presentation" class="dropdown-header"><strong>Contribute</strong></li> |
| <li><a href="http://flink.apache.org/docs/0.9/internals/how_to_contribute.html">How to Contribute</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/internals/coding_guidelines.html">Coding Guidelines</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/internals/ide_setup.html">IDE Setup</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/internals/logging.html">Logging</a></li> |
| <li class="divider"></li> |
| <li role="presentation" class="dropdown-header"><strong>Internals</strong></li> |
| <li><a href="http://flink.apache.org/docs/0.9/internals/general_arch.html">Architecture &amp; Process Model</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/internals/types_serialization.html">Type Extraction &amp; Serialization</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/internals/job_scheduling.html">Jobs &amp; Scheduling</a></li> |
| <li><a href="http://flink.apache.org/docs/0.9/internals/add_operator.html">How-To: Add an Operator</a></li> |
| </ul> |
| </li> |
| </ul> |
| <form class="navbar-form navbar-right hidden-sm hidden-md" role="search" action="http://flink.apache.org/docs/0.9/search-results.html"> |
| <div class="form-group"> |
| <input type="text" class="form-control" name="q" placeholder="Search all pages"> |
| </div> |
| <button type="submit" class="btn btn-default">Search</button> |
| </form> |
| </div><!-- /.navbar-collapse --> |
| </div><!-- /.container --> |
| </nav> |
| |
| |
| |
| |
| <!-- Main content. --> |
| <div class="container"> |
| |
| |
| <div class="row"> |
| <div class="col-sm-10 col-sm-offset-1"> |
| <h1>Quick Start: Run K-Means Example</h1> |
| |
| |
| |
| <ul id="markdown-toc"> |
| <li><a href="#setup-flink" id="markdown-toc-setup-flink">Setup Flink</a></li> |
| <li><a href="#generate-input-data" id="markdown-toc-generate-input-data">Generate Input Data</a></li> |
| <li><a href="#inspect-the-input-data" id="markdown-toc-inspect-the-input-data">Inspect the Input Data</a></li> |
| <li><a href="#start-flink" id="markdown-toc-start-flink">Start Flink</a></li> |
| <li><a href="#inspect-and-run-the-k-means-example-program" id="markdown-toc-inspect-and-run-the-k-means-example-program">Inspect and Run the K-Means Example Program</a></li> |
| <li><a href="#shutdown-flink" id="markdown-toc-shutdown-flink">Shutdown Flink</a></li> |
| <li><a href="#analyze-the-result" id="markdown-toc-analyze-the-result">Analyze the Result</a></li> |
| </ul> |
| |
| <p>This guide walks you through the steps of executing an example program (<a href="http://en.wikipedia.org/wiki/K-means_clustering">K-Means clustering</a>) on Flink. On the way, you will see a visualization of the program, the optimized execution plan, and track the progress of its execution.</p> |
| |
| <h2 id="setup-flink">Setup Flink</h2> |
| <p>Follow the <a href="setup_quickstart.html">instructions</a> to set up Flink and enter the root directory of your Flink setup.</p> |
| |
| <h2 id="generate-input-data">Generate Input Data</h2> |
| <p>Flink contains a data generator for K-Means.</p> |
| |
| <div class="highlight"><pre><code class="language-bash"><span class="c"># Assuming you are in the root directory of your Flink setup</span> |
| mkdir kmeans |
| <span class="nb">cd </span>kmeans |
| <span class="c"># Run data generator</span> |
| java -cp ../examples/flink-java-examples-0.9.0-KMeans.jar:../lib/flink-dist-0.9.0.jar <span class="se">\</span> |
| org.apache.flink.examples.java.clustering.util.KMeansDataGenerator <span class="se">\</span> |
| -points <span class="m">500</span> -k <span class="m">10</span> -stddev 0.08 -output <span class="sb">`</span><span class="nb">pwd</span><span class="sb">`</span></code></pre></div> |
| |
| <p>The generator has the following arguments (arguments in <code>[]</code> are optional):</p> |
| |
| <div class="highlight"><pre><code class="language-bash">-points &lt;num&gt; -k &lt;num clusters&gt; <span class="o">[</span>-output &lt;output-path&gt;<span class="o">]</span> <span class="o">[</span>-stddev &lt;relative stddev&gt;<span class="o">]</span> <span class="o">[</span>-range &lt;centroid range&gt;<span class="o">]</span> <span class="o">[</span>-seed &lt;seed&gt;<span class="o">]</span></code></pre></div> |
| |
| <p>The <em>relative standard deviation</em> is an interesting tuning parameter. It determines the closeness of the points to randomly generated centers.</p> |
| |
| <p>The <code>kmeans/</code> directory should now contain two files: <code>centers</code> and <code>points</code>. The <code>points</code> file contains the points to cluster and the <code>centers</code> file contains initial cluster centers.</p> |
| |
| <h2 id="inspect-the-input-data">Inspect the Input Data</h2> |
| <p>Use the <code>plotPoints.py</code> tool to review the generated data points. <a href="quickstart/plotPoints.py">Download Python Script</a></p> |
| |
| <div class="highlight"><pre><code class="language-bash">python plotPoints.py points ./points input</code></pre></div> |
| |
| <p>Note: You might have to install <a href="http://matplotlib.org/">matplotlib</a> (<code>python-matplotlib</code> package on Ubuntu) to use the Python script.</p> |
| |
| <p>You can review the input data stored in the <code>input-plot.pdf</code>, for example with Evince (<code>evince input-plot.pdf</code>).</p> |
| |
| <p>The following overview presents the impact of the different standard deviations on the input data.</p> |
| |
| <table> |
| <thead> |
| <tr> |
| <th style="text-align: center">relative stddev = 0.03</th> |
| <th style="text-align: center">relative stddev = 0.08</th> |
| <th style="text-align: center">relative stddev = 0.15</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <td style="text-align: center"><img src="http://flink.apache.org/docs/0.9/page/img/quickstart-example/kmeans003.png" alt="example1" style="width: 275px;" /></td> |
| <td style="text-align: center"><img src="http://flink.apache.org/docs/0.9/page/img/quickstart-example/kmeans008.png" alt="example2" style="width: 275px;" /></td> |
| <td style="text-align: center"><img src="http://flink.apache.org/docs/0.9/page/img/quickstart-example/kmeans015.png" alt="example3" style="width: 275px;" /></td> |
| </tr> |
| </tbody> |
| </table> |
| |
| <h2 id="start-flink">Start Flink</h2> |
| <p>Start Flink and the web job submission client on your local machine.</p> |
| |
| <div class="highlight"><pre><code class="language-bash"><span class="c"># return to the Flink root directory</span> |
| <span class="nb">cd</span> .. |
| <span class="c"># start Flink</span> |
| ./bin/start-local.sh |
| <span class="c"># Start the web client</span> |
| ./bin/start-webclient.sh</code></pre></div> |
| |
| <h2 id="inspect-and-run-the-k-means-example-program">Inspect and Run the K-Means Example Program</h2> |
| <p>The Flink web client allows you to submit Flink programs using a graphical user interface.</p> |
| |
| <div class="row" style="padding-top:15px"> |
| <div class="col-md-6"> |
| <a data-lightbox="example-1" href="http://flink.apache.org/docs/0.9/page/img/quickstart-example/run-webclient.png"><img class="img-responsive" src="http://flink.apache.org/docs/0.9/page/img/quickstart-example/run-webclient.png" /></a> |
| </div> |
| <div class="col-md-6"> |
| 1. Open web client on <a href="http://localhost:8080/launch.html">localhost:8080</a> <br /> |
| 2. Upload the K-Means job JAR file. |
| |
| <div class="highlight"><pre><code class="language-bash" data-lang="bash">./examples/flink-java-examples-*-KMeans.jar</code></pre></div> |
| |
| 3. Select it in the left box to see how the operators in the plan are connected to each other. <br /> |
| 4. Enter the arguments in the lower left box: |
| |
| <div class="highlight"><pre><code class="language-bash" data-lang="bash">file://&lt;pathToFlink&gt;/kmeans/points file://&lt;pathToFlink&gt;/kmeans/centers file://&lt;pathToFlink&gt;/kmeans/result 10</code></pre></div> |
| |
| For example: |
| |
| <div class="highlight"><pre><code class="language-bash" data-lang="bash">file:///tmp/flink/kmeans/points file:///tmp/flink/kmeans/centers file:///tmp/flink/kmeans/result 10</code></pre></div> |
| |
| </div> |
| </div> |
| <hr /> |
| |
| <div class="row" style="padding-top:15px"> |
| <div class="col-md-6"> |
| <a data-lightbox="example-1" href="http://flink.apache.org/docs/0.9/page/img/quickstart-example/compiler-webclient-new.png"><img class="img-responsive" src="http://flink.apache.org/docs/0.9/page/img/quickstart-example/compiler-webclient-new.png" /></a> |
| </div> |
| |
| <div class="col-md-6"> |
| 1. Press the <b>RunJob</b> button to see the optimizer plan. <br /> |
| 2. Inspect the operators and see the properties (input sizes, cost estimation) determined by the optimizer. |
| </div> |
| </div> |
| <hr /> |
| |
| <div class="row" style="padding-top:15px"> |
| <div class="col-md-6"> |
| <a data-lightbox="example-1" href="http://flink.apache.org/docs/0.9/page/img/quickstart-example/jobmanager-running-new.png"><img class="img-responsive" src="http://flink.apache.org/docs/0.9/page/img/quickstart-example/jobmanager-running-new.png" /></a> |
| </div> |
| <div class="col-md-6"> |
| 1. Press the <b>Continue</b> button to start executing the job. <br /> |
| 2. <a href="http://localhost:8080/launch.html">Open Flink's monitoring interface</a> to see the job's progress. (Due to the small input data, the job will finish really quickly!)<br /> |
| 3. Once the job has finished, you can analyze the runtime of the individual operators. |
| </div> |
| </div> |
| |
| <h2 id="shutdown-flink">Shutdown Flink</h2> |
| <p>Stop Flink when you are done.</p> |
| |
| <div class="highlight"><pre><code class="language-bash"><span class="c"># stop Flink</span> |
| ./bin/stop-local.sh |
| <span class="c"># Stop the Flink web client</span> |
| ./bin/stop-webclient.sh</code></pre></div> |
| |
| <h2 id="analyze-the-result">Analyze the Result</h2> |
| <p>Use the <a href="quickstart/plotPoints.py">Python Script</a> again to visualize the result.</p> |
| |
| <div class="highlight"><pre><code class="language-bash"><span class="nb">cd </span>kmeans |
| python plotPoints.py result ./result clusters</code></pre></div> |
| |
| <p>The following three pictures show the results for the sample input above. Play around with the parameters (number of iterations, number of clusters) to see how they affect the result.</p> |
| |
| <table> |
| <thead> |
| <tr> |
| <th style="text-align: center">relative stddev = 0.03</th> |
| <th style="text-align: center">relative stddev = 0.08</th> |
| <th style="text-align: center">relative stddev = 0.15</th> |
| </tr> |
| </thead> |
| <tbody> |
| <tr> |
| <td style="text-align: center"><img src="http://flink.apache.org/docs/0.9/page/img/quickstart-example/result003.png" alt="example1" style="width: 275px;" /></td> |
| <td style="text-align: center"><img src="http://flink.apache.org/docs/0.9/page/img/quickstart-example/result008.png" alt="example2" style="width: 275px;" /></td> |
| <td style="text-align: center"><img src="http://flink.apache.org/docs/0.9/page/img/quickstart-example/result015.png" alt="example3" style="width: 275px;" /></td> |
| </tr> |
| </tbody> |
| </table> |
| |
| </div> |
| |
| <div class="col-sm-10 col-sm-offset-1"> |
| <!-- Disqus thread and some vertical offset --> |
| <div style="margin-top: 75px; margin-bottom: 50px" id="disqus_thread"></div> |
| </div> |
| </div> |
| |
| </div><!-- /.container --> |
| |
| <!-- jQuery (necessary for Bootstrap's JavaScript plugins) --> |
| <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script> |
| <!-- Include all compiled plugins (below), or include individual files as needed --> |
| <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script> |
| <script src="http://flink.apache.org/docs/0.9/page/js/codetabs.js"></script> |
| |
| <!-- Google Analytics --> |
| <script> |
| (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ |
| (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), |
| m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) |
| })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); |
| |
| ga('create', 'UA-52545728-1', 'auto'); |
| ga('send', 'pageview'); |
| </script> |
| |
| <!-- Disqus --> |
| <script type="text/javascript"> |
| var disqus_shortname = 'stratosphere-eu'; |
| (function() { |
| var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true; |
| dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js'; |
| (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq); |
| })(); |
| </script> |
| </body> |
| </html> |