blob: 577e5e1a0444c6fe92e6aacd083da2e12ce9696c [file] [log] [blame]
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
<title>Apache Flink 0.10-SNAPSHOT Documentation: FlinkML - SVM using CoCoA</title>
<link rel="shortcut icon" href="http://flink.apache.org/docs/master/page/favicon.ico" type="image/x-icon">
<link rel="icon" href="http://flink.apache.org/docs/master/page/favicon.ico" type="image/x-icon">
<!-- Bootstrap -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">
<link rel="stylesheet" href="http://flink.apache.org/docs/master/page/css/flink.css">
<link rel="stylesheet" href="http://flink.apache.org/docs/master/page/css/syntax.css">
<link rel="stylesheet" href="http://flink.apache.org/docs/master/page/css/codetabs.css">
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
tex2jax: {
inlineMath: [['$','$'], ['\\(','\\)']] },
TeX: {
equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script type="text/javascript"
src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
</script>
<!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
<!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
</head>
<body>
<!-- Top navbar. -->
<nav class="navbar navbar-default navbar-fixed-top">
<div class="container">
<!-- The logo. -->
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1">
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<div class="navbar-logo">
<a href="http://flink.apache.org"><img alt="Apache Flink" src="http://flink.apache.org/docs/master/page/img/navbar-brand-logo.jpg"></a>
</div>
</div><!-- /.navbar-header -->
<!-- The navigation links. -->
<div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
<ul class="nav navbar-nav">
<li><a href="http://flink.apache.org/docs/master/index.html">Overview<span class="hidden-sm hidden-xs"> 0.10</span></a></li>
<!-- Setup -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/master/setup" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Setup <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li><a href="http://flink.apache.org/docs/master/setup/building.html">Get Flink 0.10-SNAPSHOT</a></li>
<li class="divider"></li>
<li role="presentation" class="dropdown-header"><strong>Deployment</strong></li>
<li><a href="http://flink.apache.org/docs/master/setup/local_setup.html" class="active">Local</a></li>
<li><a href="http://flink.apache.org/docs/master/setup/cluster_setup.html">Cluster (Standalone)</a></li>
<li><a href="http://flink.apache.org/docs/master/setup/yarn_setup.html">YARN</a></li>
<li><a href="http://flink.apache.org/docs/master/setup/gce_setup.html">GCloud</a></li>
<li><a href="http://flink.apache.org/docs/master/setup/flink_on_tez.html">Flink on Tez <span class="badge">Beta</span></a></li>
<li class="divider"></li>
<li><a href="http://flink.apache.org/docs/master/setup/config.html">Configuration</a></li>
</ul>
</li>
<!-- Programming Guides -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/master/apis" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Programming Guides <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li><a href="http://flink.apache.org/docs/master/apis/programming_guide.html"><strong>Batch: DataSet API</strong></a></li>
<li><a href="http://flink.apache.org/docs/master/apis/streaming_guide.html"><strong>Streaming: DataStream API</strong> <span class="badge">Beta</span></a></li>
<li><a href="http://flink.apache.org/docs/master/apis/python.html">Python API <span class="badge">Beta</span></a></li>
<li class="divider"></li>
<li><a href="scala_shell.html">Interactive Scala Shell</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/dataset_transformations.html">Dataset Transformations</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/best_practices.html">Best Practices</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/example_connectors.html">Connectors</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/examples.html">Examples</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/local_execution.html">Local Execution</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/cluster_execution.html">Cluster Execution</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/cli.html">Command Line Interface</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/web_client.html">Web Client</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/iterations.html">Iterations</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/java8.html">Java 8</a></li>
<li><a href="http://flink.apache.org/docs/master/apis/hadoop_compatibility.html">Hadoop Compatability <span class="badge">Beta</span></a></li>
</ul>
</li>
<!-- Libraries -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/master/libs" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Libraries <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li><a href="http://flink.apache.org/docs/master/libs/spargel_guide.html">Graphs: Spargel</a></li>
<li><a href="http://flink.apache.org/docs/master/libs/gelly_guide.html">Graphs: Gelly <span class="badge">Beta</span></a></li>
<li><a href="http://flink.apache.org/docs/master/libs/ml/">Machine Learning <span class="badge">Beta</span></a></li>
<li><a href="http://flink.apache.org/docs/master/libs/table.html">Relational: Table <span class="badge">Beta</span></a></li>
</ul>
</li>
<!-- Internals -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/master/internals" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Internals <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li role="presentation" class="dropdown-header"><strong>Contribute</strong></li>
<li><a href="http://flink.apache.org/docs/master/internals/how_to_contribute.html">How to Contribute</a></li>
<li><a href="http://flink.apache.org/docs/master/internals/coding_guidelines.html">Coding Guidelines</a></li>
<li><a href="http://flink.apache.org/docs/master/internals/ide_setup.html">IDE Setup</a></li>
<li><a href="http://flink.apache.org/docs/master/internals/logging.html">Logging</a></li>
<li class="divider"></li>
<li role="presentation" class="dropdown-header"><strong>Internals</strong></li>
<li><a href="http://flink.apache.org/docs/master/internals/general_arch.html">Architecture &amp; Process Model</a></li>
<li><a href="http://flink.apache.org/docs/master/internals/types_serialization.html">Type Extraction &amp; Serialization</a></li>
<li><a href="http://flink.apache.org/docs/master/internals/job_scheduling.html">Jobs &amp; Scheduling</a></li>
<li><a href="http://flink.apache.org/docs/master/internals/add_operator.html">How-To: Add an Operator</a></li>
</ul>
</li>
</ul>
<form class="navbar-form navbar-right hidden-sm hidden-md" role="search" action="http://flink.apache.org/docs/master/search-results.html">
<div class="form-group">
<input type="text" class="form-control" name="q" placeholder="Search all pages">
</div>
<button type="submit" class="btn btn-default">Search</button>
</form>
</div><!-- /.navbar-collapse -->
</div><!-- /.container -->
</nav>
<!--Some of the Latex math notation has been adapted from Apache Spark MLlib's documentation-->
$$
\newcommand{\R}{\mathbb{R}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\newcommand{\wv}{\mathbf{w}}
\newcommand{\av}{\mathbf{\alpha}}
\newcommand{\bv}{\mathbf{b}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\id}{\mathbf{I}}
\newcommand{\ind}{\mathbf{1}}
\newcommand{\0}{\mathbf{0}}
\newcommand{\unit}{\mathbf{e}}
\newcommand{\one}{\mathbf{1}}
\newcommand{\zero}{\mathbf{0}}
\newcommand\rfrac[2]{^{#1}\!/_{#2}}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
$$
<!-- Main content. -->
<div class="container">
<div class="row">
<div class="col-sm-10 col-sm-offset-1">
<h1><a href="../ml">FlinkML</a> - SVM using CoCoA</h1>
<ul id="markdown-toc">
<li><a href="#description" id="markdown-toc-description">Description</a></li>
<li><a href="#operations" id="markdown-toc-operations">Operations</a> <ul>
<li><a href="#fit" id="markdown-toc-fit">Fit</a></li>
<li><a href="#predict" id="markdown-toc-predict">Predict</a></li>
</ul>
</li>
<li><a href="#parameters" id="markdown-toc-parameters">Parameters</a></li>
<li><a href="#examples" id="markdown-toc-examples">Examples</a></li>
</ul>
<h2 id="description">Description</h2>
<p>Implements an SVM with soft-margin using the communication-efficient distributed dual coordinate
ascent algorithm with hinge-loss function.
The algorithm solves the following minimization problem:</p>
<script type="math/tex; mode=display">\min_{\mathbf{w} \in \mathbb{R}^d} \frac{\lambda}{2} \left\lVert \mathbf{w} \right\rVert^2 + \frac{1}{n} \sum_{i=1}^n l_{i}\left(\mathbf{w}^T\mathbf{x}_i\right)</script>
<p>with $\mathbf{w}$ being the weight vector, $\lambda$ being the regularization constant,
<script type="math/tex">\mathbf{x}_i \in \mathbb{R}^d</script> being the data points and <script type="math/tex">l_{i}</script> being the convex loss
functions, which can also depend on the labels <script type="math/tex">y_{i} \in \mathbb{R}</script>.
In the current implementation the regularizer is the $\ell_2$-norm and the loss functions are the hinge-loss functions:</p>
<script type="math/tex; mode=display">l_{i} = \max\left(0, 1 - y_{i} \mathbf{w}^T\mathbf{x}_i \right)</script>
<p>With these choices, the problem definition is equivalent to a SVM with soft-margin.
Thus, the algorithm allows us to train a SVM with soft-margin.</p>
<p>The minimization problem is solved by applying stochastic dual coordinate ascent (SDCA).
In order to make the algorithm efficient in a distributed setting, the CoCoA algorithm calculates
several iterations of SDCA locally on a data block before merging the local updates into a
valid global state.
This state is redistributed to the different data partitions where the next round of local SDCA
iterations is then executed.
The number of outer iterations and local SDCA iterations control the overall network costs, because
there is only network communication required for each outer iteration.
The local SDCA iterations are embarrassingly parallel once the individual data partitions have been
distributed across the cluster.</p>
<p>The implementation of this algorithm is based on the work of
<a href="http://arxiv.org/abs/1409.1458">Jaggi et al.</a></p>
<h2 id="operations">Operations</h2>
<p><code>SVM</code> is a <code>Predictor</code>.
As such, it supports the <code>fit</code> and <code>predict</code> operation.</p>
<h3 id="fit">Fit</h3>
<p>SVM is trained given a set of <code>LabeledVector</code>:</p>
<ul>
<li><code>fit: DataSet[LabeledVector] =&gt; Unit</code></li>
</ul>
<h3 id="predict">Predict</h3>
<p>SVM predicts for all subtypes of <code>Vector</code> the corresponding class label:</p>
<ul>
<li><code>predict[T &lt;: Vector]: DataSet[T] =&gt; DataSet[LabeledVector]</code></li>
</ul>
<p>If we call predict with a <code>DataSet[LabeledVector]</code>, we make a prediction on the class label
for each example, and return a <code>DataSet[(Double, Double)]</code>. In each tuple the first element
is the true value, as was provided from the input <code>DataSet[LabeledVector]</code> and the second element
is the predicted value. You can then use these <code>(truth, prediction)</code> tuples to evaluate
the algorithm’s performance.</p>
<ul>
<li><code>predict: DataSet[LabeledVector] =&gt; DataSet[(Double, Double)]</code></li>
</ul>
<h2 id="parameters">Parameters</h2>
<p>The SVM implementation can be controlled by the following parameters:</p>
<table class="table table-bordered">
<thead>
<tr>
<th class="text-left" style="width: 20%">Parameters</th>
<th class="text-center">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>Blocks</strong></td>
<td>
<p>
Sets the number of blocks into which the input data will be split.
On each block the local stochastic dual coordinate ascent method is executed.
This number should be set at least to the degree of parallelism.
If no value is specified, then the parallelism of the input DataSet is used as the number of blocks.
(Default value: <strong>None</strong>)
</p>
</td>
</tr>
<tr>
<td><strong>Iterations</strong></td>
<td>
<p>
Defines the maximum number of iterations of the outer loop method.
In other words, it defines how often the SDCA method is applied to the blocked data.
After each iteration, the locally computed weight vector updates have to be reduced to update the global weight vector value.
The new weight vector is broadcast to all SDCA tasks at the beginning of each iteration.
(Default value: <strong>10</strong>)
</p>
</td>
</tr>
<tr>
<td><strong>LocalIterations</strong></td>
<td>
<p>
Defines the maximum number of SDCA iterations.
In other words, it defines how many data points are drawn from each local data block to calculate the stochastic dual coordinate ascent.
(Default value: <strong>10</strong>)
</p>
</td>
</tr>
<tr>
<td><strong>Regularization</strong></td>
<td>
<p>
Defines the regularization constant of the SVM algorithm.
The higher the value, the smaller will the 2-norm of the weight vector be.
In case of a SVM with hinge loss this means that the SVM margin will be wider even though it might contain some false classifications.
(Default value: <strong>1.0</strong>)
</p>
</td>
</tr>
<tr>
<td><strong>Stepsize</strong></td>
<td>
<p>
Defines the initial step size for the updates of the weight vector.
The larger the step size is, the larger will be the contribution of the weight vector updates to the next weight vector value.
The effective scaling of the updates is $\frac{stepsize}{blocks}$.
This value has to be tuned in case that the algorithm becomes unstable.
(Default value: <strong>1.0</strong>)
</p>
</td>
</tr>
<tr>
<td><strong>Seed</strong></td>
<td>
<p>
Defines the seed to initialize the random number generator.
The seed directly controls which data points are chosen for the SDCA method.
(Default value: <strong>0</strong>)
</p>
</td>
</tr>
</tbody>
</table>
<h2 id="examples">Examples</h2>
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// Read the training data set, from a LibSVM formatted file</span>
<span class="k">val</span> <span class="n">trainingDS</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">LabeledVector</span><span class="o">]</span> <span class="k">=</span> <span class="n">env</span><span class="o">.</span><span class="n">readLibSVM</span><span class="o">(</span><span class="n">pathToTrainingFile</span><span class="o">)</span>
<span class="c1">// Create the SVM learner</span>
<span class="k">val</span> <span class="n">svm</span> <span class="k">=</span> <span class="nc">SVM</span><span class="o">()</span>
<span class="o">.</span><span class="n">setBlocks</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span>
<span class="o">.</span><span class="n">setIterations</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span>
<span class="o">.</span><span class="n">setLocalIterations</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span>
<span class="o">.</span><span class="n">setRegularization</span><span class="o">(</span><span class="mf">0.5</span><span class="o">)</span>
<span class="o">.</span><span class="n">setStepsize</span><span class="o">(</span><span class="mf">0.5</span><span class="o">)</span>
<span class="c1">// Learn the SVM model</span>
<span class="n">svm</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">trainingDS</span><span class="o">)</span>
<span class="c1">// Read the testing data set</span>
<span class="k">val</span> <span class="n">testingDS</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">Vector</span><span class="o">]</span> <span class="k">=</span> <span class="n">env</span><span class="o">.</span><span class="n">readVectorFile</span><span class="o">(</span><span class="n">pathToTestingFile</span><span class="o">)</span>
<span class="c1">// Calculate the predictions for the testing data set</span>
<span class="k">val</span> <span class="n">predictionDS</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">LabeledVector</span><span class="o">]</span> <span class="k">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">testingDS</span><span class="o">)</span></code></pre></div>
</div>
<div class="col-sm-10 col-sm-offset-1">
<!-- Disqus thread and some vertical offset -->
<div style="margin-top: 75px; margin-bottom: 50px" id="disqus_thread"></div>
</div>
</div>
</div><!-- /.container -->
<!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script>
<!-- Include all compiled plugins (below), or include individual files as needed -->
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script>
<script src="http://flink.apache.org/docs/master/page/js/codetabs.js"></script>
<!-- Google Analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-52545728-1', 'auto');
ga('send', 'pageview');
</script>
<!-- Disqus -->
<script type="text/javascript">
var disqus_shortname = 'stratosphere-eu';
(function() {
var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
(document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
})();
</script>
</body>
</html>