blob: afde0f4c8f5e616d5d554d0f5673702cc050f5b1 [file] [log] [blame]
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
<title>Apache Flink 0.10-SNAPSHOT Documentation: Quick Start: Run K-Means Example</title>
<link rel="shortcut icon" href="http://flink.apache.org/docs/master/page/favicon.ico" type="image/x-icon">
<link rel="icon" href="http://flink.apache.org/docs/master/page/favicon.ico" type="image/x-icon">
<!-- Bootstrap -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">
<link rel="stylesheet" href="http://flink.apache.org/docs/master/page/css/flink.css">
<link rel="stylesheet" href="http://flink.apache.org/docs/master/page/css/syntax.css">
<link rel="stylesheet" href="http://flink.apache.org/docs/master/page/css/codetabs.css">
<!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
<!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
</head>
<body>
<!-- Top navbar. -->
<nav class="navbar navbar-default navbar-fixed-top">
<div class="container">
<!-- The logo. -->
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1">
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<div class="navbar-logo">
<a href="http://flink.apache.org"><img alt="Apache Flink" src="http://flink.apache.org/docs/master/page/img/navbar-brand-logo.jpg"></a>
</div>
</div><!-- /.navbar-header -->
<!-- The navigation links. -->
<div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
<ul class="nav navbar-nav">
<li><a href="http://flink.apache.org/docs/master/index.html">Overview<span class="hidden-sm hidden-xs"> 0.10</span></a></li>
<!-- Setup -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/master/setup" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Setup <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li><a href="http://flink.apache.org/docs/master/setup/building.html">Get Flink 0.10-SNAPSHOT</a></li>
<li class="divider"></li>
<li role="presentation" class="dropdown-header"><strong>Deployment</strong></li>
<li><a href="http://flink.apache.org/docs/master/setup/local_setup.html" class="active">Local</a></li>
<li><a href="http://flink.apache.org/docs/master/setup/cluster_setup.html">Cluster (Standalone)</a></li>
<li><a href="http://flink.apache.org/docs/master/setup/yarn_setup.html">YARN</a></li>
<li><a href="http://flink.apache.org/docs/master/setup/gce_setup.html">GCloud</a></li>
<li><a href="http://flink.apache.org/docs/master/setup/flink_on_tez.html">Flink on Tez <span class="badge">Beta</span></a></li>
<li class="divider"></li>
<li><a href="http://flink.apache.org/docs/master/setup/config.html">Configuration</a></li>
</ul>
</li>
<!-- Programming Guides -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/master/apis" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Programming Guides <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li><a href="http://flink.apache.org/docs/master/apis/programming_guide.html"><strong>Batch: DataSet API</strong></a></li>
<li><a href="http://flink.apache.org/docs/master/apis/streaming_guide.html"><strong>Streaming: DataStream API</strong> <span class="badge">Beta</span></a></li>
<li><a href="http://flink.apache.org/docs/master/apis/python.html">Python API <span class="badge">Beta</span></a></li>
<li class="divider"></li>
<li><a href="http://flink.apache.org/docs/master/apis/scala_shell.html">Interactive Scala Shell</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/dataset_transformations.html">Dataset Transformations</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/best_practices.html">Best Practices</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/example_connectors.html">Connectors</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/examples.html">Examples</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/local_execution.html">Local Execution</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/cluster_execution.html">Cluster Execution</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/cli.html">Command Line Interface</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/web_client.html">Web Client</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/iterations.html">Iterations</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/java8.html">Java 8</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/hadoop_compatibility.html">Hadoop Compatibility <span class="badge">Beta</span></a></li>
</ul>
</li>
<!-- Libraries -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/master/libs" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Libraries <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li><a href="http://flink.apache.org/docs/master/libs/spargel_guide.html">Graphs: Spargel</a></li>
<li><a href="http://flink.apache.org/docs/master/libs/gelly_guide.html">Graphs: Gelly <span class="badge">Beta</span></a></li>
<li><a href="http://flink.apache.org/docs/master/libs/ml/">Machine Learning <span class="badge">Beta</span></a></li>
<li><a href="http://flink.apache.org/docs/master/libs/table.html">Relational: Table <span class="badge">Beta</span></a></li>
</ul>
</li>
<!-- Internals -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/master/internals" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Internals <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li role="presentation" class="dropdown-header"><strong>Contribute</strong></li>
<li><a href="http://flink.apache.org/docs/master/internals/how_to_contribute.html">How to Contribute</a></li>
<li><a href="http://flink.apache.org/docs/master/internals/coding_guidelines.html">Coding Guidelines</a></li>
<li><a href="http://flink.apache.org/docs/master/internals/ide_setup.html">IDE Setup</a></li>
<li><a href="http://flink.apache.org/docs/master/internals/logging.html">Logging</a></li>
<li class="divider"></li>
<li role="presentation" class="dropdown-header"><strong>Internals</strong></li>
<li><a href="http://flink.apache.org/docs/master/internals/general_arch.html">Architecture &amp; Process Model</a></li>
<li><a href="http://flink.apache.org/docs/master/internals/types_serialization.html">Type Extraction &amp; Serialization</a></li>
<li><a href="http://flink.apache.org/docs/master/internals/job_scheduling.html">Jobs &amp; Scheduling</a></li>
<li><a href="http://flink.apache.org/docs/master/internals/add_operator.html">How-To: Add an Operator</a></li>
</ul>
</li>
</ul>
<form class="navbar-form navbar-right hidden-sm hidden-md" role="search" action="http://flink.apache.org/docs/master/search-results.html">
<div class="form-group">
<input type="text" class="form-control" name="q" placeholder="Search all pages">
</div>
<button type="submit" class="btn btn-default">Search</button>
</form>
</div><!-- /.navbar-collapse -->
</div><!-- /.container -->
</nav>
<!-- Main content. -->
<div class="container">
<div class="row">
<div class="col-sm-10 col-sm-offset-1">
<h1>Quick Start: Run K-Means Example</h1>
<ul id="markdown-toc">
<li><a href="#setup-flink" id="markdown-toc-setup-flink">Setup Flink</a></li>
<li><a href="#generate-input-data" id="markdown-toc-generate-input-data">Generate Input Data</a></li>
<li><a href="#inspect-the-input-data" id="markdown-toc-inspect-the-input-data">Inspect the Input Data</a></li>
<li><a href="#start-flink" id="markdown-toc-start-flink">Start Flink</a></li>
<li><a href="#inspect-and-run-the-k-means-example-program" id="markdown-toc-inspect-and-run-the-k-means-example-program">Inspect and Run the K-Means Example Program</a></li>
<li><a href="#shutdown-flink" id="markdown-toc-shutdown-flink">Shutdown Flink</a></li>
<li><a href="#analyze-the-result" id="markdown-toc-analyze-the-result">Analyze the Result</a></li>
</ul>
<p>This guide walks you through the steps of executing an example program (<a href="http://en.wikipedia.org/wiki/K-means_clustering">K-Means clustering</a>) on Flink. Along the way, you will see a visualization of the program and its optimized execution plan, and track the progress of its execution.</p>
<h2 id="setup-flink">Setup Flink</h2>
<p>Follow the <a href="setup_quickstart.html">instructions</a> to set up Flink and enter the root directory of your Flink setup.</p>
<h2 id="generate-input-data">Generate Input Data</h2>
<p>Flink contains a data generator for K-Means.</p>
<div class="highlight"><pre><code class="language-bash"><span class="c"># Assuming you are in the root directory of your Flink setup</span>
mkdir kmeans
<span class="nb">cd </span>kmeans
<span class="c"># Run data generator</span>
java -cp ../examples/flink-java-examples-0.10-SNAPSHOT-KMeans.jar:../lib/flink-dist-0.10-SNAPSHOT.jar <span class="se">\</span>
org.apache.flink.examples.java.clustering.util.KMeansDataGenerator <span class="se">\</span>
-points <span class="m">500</span> -k <span class="m">10</span> -stddev 0.08 -output <span class="sb">`</span><span class="nb">pwd</span><span class="sb">`</span></code></pre></div>
<p>The generator has the following arguments (arguments in <code>[]</code> are optional):</p>
<div class="highlight"><pre><code class="language-bash">-points &lt;num&gt; -k &lt;num clusters&gt; <span class="o">[</span>-output &lt;output-path&gt;<span class="o">]</span> <span class="o">[</span>-stddev &lt;relative stddev&gt;<span class="o">]</span> <span class="o">[</span>-range &lt;centroid range&gt;<span class="o">]</span> <span class="o">[</span>-seed &lt;seed&gt;<span class="o">]</span></code></pre></div>
<p>The <em>relative standard deviation</em> is an interesting tuning parameter. It determines the closeness of the points to randomly generated centers.</p>
<p>The <code>kmeans/</code> directory should now contain two files: <code>centers</code> and <code>points</code>. The <code>points</code> file contains the points to cluster and the <code>centers</code> file contains initial cluster centers.</p>
<h2 id="inspect-the-input-data">Inspect the Input Data</h2>
<p>Use the <code>plotPoints.py</code> tool to review the generated data points. <a href="quickstart/plotPoints.py">Download Python Script</a></p>
<div class="highlight"><pre><code class="language-bash">python plotPoints.py points ./points input</code></pre></div>
<p>Note: You might have to install <a href="http://matplotlib.org/">matplotlib</a> (<code>python-matplotlib</code> package on Ubuntu) to use the Python script.</p>
<p>You can review the input data stored in the <code>input-plot.pdf</code>, for example with Evince (<code>evince input-plot.pdf</code>).</p>
<p>The following overview presents the impact of the different standard deviations on the input data.</p>
<table>
<thead>
<tr>
<th style="text-align: center">relative stddev = 0.03</th>
<th style="text-align: center">relative stddev = 0.08</th>
<th style="text-align: center">relative stddev = 0.15</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align: center"><img src="http://flink.apache.org/docs/master/page/img/quickstart-example/kmeans003.png" alt="Generated input points with relative stddev = 0.03" style="width: 275px;" /></td>
<td style="text-align: center"><img src="http://flink.apache.org/docs/master/page/img/quickstart-example/kmeans008.png" alt="Generated input points with relative stddev = 0.08" style="width: 275px;" /></td>
<td style="text-align: center"><img src="http://flink.apache.org/docs/master/page/img/quickstart-example/kmeans015.png" alt="Generated input points with relative stddev = 0.15" style="width: 275px;" /></td>
</tr>
</tbody>
</table>
<h2 id="start-flink">Start Flink</h2>
<p>Start Flink and the web job submission client on your local machine.</p>
<div class="highlight"><pre><code class="language-bash"><span class="c"># return to the Flink root directory</span>
<span class="nb">cd</span> ..
<span class="c"># start Flink</span>
./bin/start-local.sh
<span class="c"># Start the web client</span>
./bin/start-webclient.sh</code></pre></div>
<h2 id="inspect-and-run-the-k-means-example-program">Inspect and Run the K-Means Example Program</h2>
<p>The Flink web client allows you to submit Flink programs using a graphical user interface.</p>
<div class="row" style="padding-top:15px">
<div class="col-md-6">
<a data-lightbox="example-1" href="http://flink.apache.org/docs/master/page/img/quickstart-example/run-webclient.png"><img class="img-responsive" src="http://flink.apache.org/docs/master/page/img/quickstart-example/run-webclient.png" alt="Screenshot of the Flink web client job submission page" /></a>
</div>
<div class="col-md-6">
1. Open web client on <a href="http://localhost:8080/launch.html">localhost:8080</a> <br />
2. Upload the K-Means job JAR file.
<div class="highlight"><pre><code class="language-bash" data-lang="bash">./examples/flink-java-examples-*-KMeans.jar</code></pre></div>
3. Select it in the left box to see how the operators in the plan are connected to each other. <br />
4. Enter the arguments in the lower left box:
<div class="highlight"><pre><code class="language-bash" data-lang="bash">file://&lt;pathToFlink&gt;/kmeans/points file://&lt;pathToFlink&gt;/kmeans/centers file://&lt;pathToFlink&gt;/kmeans/result 10</code></pre></div>
For example:
<div class="highlight"><pre><code class="language-bash" data-lang="bash">file:///tmp/flink/kmeans/points file:///tmp/flink/kmeans/centers file:///tmp/flink/kmeans/result 10</code></pre></div>
</div>
</div>
<hr />
<div class="row" style="padding-top:15px">
<div class="col-md-6">
<a data-lightbox="example-1" href="http://flink.apache.org/docs/master/page/img/quickstart-example/compiler-webclient-new.png"><img class="img-responsive" src="http://flink.apache.org/docs/master/page/img/quickstart-example/compiler-webclient-new.png" alt="Screenshot of the optimizer plan shown in the Flink web client" /></a>
</div>
<div class="col-md-6">
1. Press the <b>RunJob</b> button to see the optimizer plan. <br />
2. Inspect the operators and see the properties (input sizes, cost estimation) determined by the optimizer.
</div>
</div>
<hr />
<div class="row" style="padding-top:15px">
<div class="col-md-6">
<a data-lightbox="example-1" href="http://flink.apache.org/docs/master/page/img/quickstart-example/jobmanager-running-new.png"><img class="img-responsive" src="http://flink.apache.org/docs/master/page/img/quickstart-example/jobmanager-running-new.png" alt="Screenshot of Flink's monitoring interface showing the running job" /></a>
</div>
<div class="col-md-6">
1. Press the <b>Continue</b> button to start executing the job. <br />
2. <a href="http://localhost:8080/launch.html">Open Flink's monitoring interface</a> to see the job's progress. (Due to the small input data, the job will finish really quickly!)<br />
3. Once the job has finished, you can analyze the runtime of the individual operators.
</div>
</div>
<h2 id="shutdown-flink">Shutdown Flink</h2>
<p>Stop Flink when you are done.</p>
<div class="highlight"><pre><code class="language-bash"><span class="c"># stop Flink</span>
./bin/stop-local.sh
<span class="c"># Stop the Flink web client</span>
./bin/stop-webclient.sh</code></pre></div>
<h2 id="analyze-the-result">Analyze the Result</h2>
<p>Use the <a href="quickstart/plotPoints.py">Python Script</a> again to visualize the result.</p>
<div class="highlight"><pre><code class="language-bash"><span class="nb">cd </span>kmeans
python plotPoints.py result ./result clusters</code></pre></div>
<p>The following three pictures show the results for the sample input above. Play around with the parameters (number of iterations, number of clusters) to see how they affect the result.</p>
<table>
<thead>
<tr>
<th style="text-align: center">relative stddev = 0.03</th>
<th style="text-align: center">relative stddev = 0.08</th>
<th style="text-align: center">relative stddev = 0.15</th>
</tr>
</thead>
<tbody>
<tr>
<td style="text-align: center"><img src="http://flink.apache.org/docs/master/page/img/quickstart-example/result003.png" alt="Clustering result for relative stddev = 0.03" style="width: 275px;" /></td>
<td style="text-align: center"><img src="http://flink.apache.org/docs/master/page/img/quickstart-example/result008.png" alt="Clustering result for relative stddev = 0.08" style="width: 275px;" /></td>
<td style="text-align: center"><img src="http://flink.apache.org/docs/master/page/img/quickstart-example/result015.png" alt="Clustering result for relative stddev = 0.15" style="width: 275px;" /></td>
</tr>
</tbody>
</table>
</div>
<div class="col-sm-10 col-sm-offset-1">
<!-- Disqus thread and some vertical offset -->
<div style="margin-top: 75px; margin-bottom: 50px" id="disqus_thread"></div>
</div>
</div>
</div><!-- /.container -->
<!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script>
<!-- Include all compiled plugins (below), or include individual files as needed -->
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script>
<script src="http://flink.apache.org/docs/master/page/js/codetabs.js"></script>
<!-- Google Analytics -->
<script>
// Google Analytics loader (vendor snippet, kept verbatim).
// Called with i=window, s=document, o='script', g=the analytics.js URL, r='ga';
// a and m receive no arguments and are used as scratch locals.
// In order, the function:
//   1. stores the name r in window.GoogleAnalyticsObject;
//   2. defines window[r] (unless it already exists) as a stub that appends its
//      arguments to the queue window[r].q;
//   3. stamps window[r].l with the current time in milliseconds;
//   4. creates a <script> element with async set and src=g, and inserts it
//      before the first <script> element already in the document.
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
// These calls go through window.ga as defined above, so they are queued if the
// stub is still in place. NOTE(review): presumably they register the tracker for
// this property ID and record a page view once analytics.js loads — confirm
// against the analytics.js documentation.
ga('create', 'UA-52545728-1', 'auto');
ga('send', 'pageview');
</script>
<!-- Disqus -->
<script type="text/javascript">
// Disqus forum identifier. Declared with `var` at script top level so it stays a
// global. NOTE(review): presumably read by the Disqus embed script — confirm.
var disqus_shortname = 'stratosphere-eu';
// Builds a <script> element pointing at the forum's embed.js and appends it to
// <head> (falling back to <body>), so it loads asynchronously after this page.
(function() {
var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
// Use an explicit https:// scheme: a protocol-relative URL inherits the page's
// scheme, which fails when the page is opened via file:// and loads the script
// insecurely when the page is served over http://.
dsq.src = 'https://' + disqus_shortname + '.disqus.com/embed.js';
(document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
})();
</script>
</body>
</html>