| <!-- |
| Licensed to the Apache Software Foundation (ASF) under one |
| or more contributor license agreements. See the NOTICE file |
| distributed with this work for additional information |
| regarding copyright ownership. The ASF licenses this file |
| to you under the Apache License, Version 2.0 (the |
| "License"); you may not use this file except in compliance |
| with the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, |
| software distributed under the License is distributed on an |
| "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| KIND, either express or implied. See the License for the |
| specific language governing permissions and limitations |
| under the License. |
| --> |
| <!DOCTYPE html> |
| |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge"> |
| <meta name="viewport" content="width=device-width, initial-scale=1"> |
| <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags --> |
| |
| <title>Apache Flink 0.10-SNAPSHOT Documentation: FlinkML - SVM using CoCoA</title> |
| |
| <link rel="shortcut icon" href="http://flink.apache.org/docs/master/page/favicon.ico" type="image/x-icon"> |
| <link rel="icon" href="http://flink.apache.org/docs/master/page/favicon.ico" type="image/x-icon"> |
| |
| <!-- Bootstrap --> |
| <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css"> |
| <link rel="stylesheet" href="http://flink.apache.org/docs/master/page/css/flink.css"> |
| <link rel="stylesheet" href="http://flink.apache.org/docs/master/page/css/syntax.css"> |
| <link rel="stylesheet" href="http://flink.apache.org/docs/master/page/css/codetabs.css"> |
| |
| <script type="text/x-mathjax-config"> |
| MathJax.Hub.Config({ |
| tex2jax: { |
| inlineMath: [['$','$'], ['\\(','\\)']] }, |
| TeX: { |
| equationNumbers: { autoNumber: "AMS" } } |
| }); |
| </script> |
| <script type="text/javascript" |
| src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"> |
| </script> |
| |
| <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries --> |
| <!-- WARNING: Respond.js doesn't work if you view the page via file:// --> |
| <!--[if lt IE 9]> |
| <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script> |
| <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script> |
| <![endif]--> |
| </head> |
| <body> |
| |
| |
| |
| |
| |
| |
| <!-- Top navbar. --> |
| <nav class="navbar navbar-default navbar-fixed-top"> |
| <div class="container"> |
| <!-- The logo. --> |
| <div class="navbar-header"> |
| <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1"> |
| <span class="icon-bar"></span> |
| <span class="icon-bar"></span> |
| <span class="icon-bar"></span> |
| </button> |
| <div class="navbar-logo"> |
| <a href="http://flink.apache.org"><img alt="Apache Flink" src="http://flink.apache.org/docs/master/page/img/navbar-brand-logo.jpg"></a> |
| </div> |
| </div><!-- /.navbar-header --> |
| |
| <!-- The navigation links. --> |
| <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1"> |
| <ul class="nav navbar-nav"> |
| <li><a href="http://flink.apache.org/docs/master/index.html">Overview<span class="hidden-sm hidden-xs"> 0.10</span></a></li> |
| |
| <!-- Setup --> |
| <li class="dropdown"> |
| <a href="http://flink.apache.org/docs/master/setup" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Setup <span class="caret"></span></a> |
| <ul class="dropdown-menu" role="menu"> |
| <li><a href="http://flink.apache.org/docs/master/setup/building.html">Get Flink 0.10-SNAPSHOT</a></li> |
| |
| <li class="divider"></li> |
| <li role="presentation" class="dropdown-header"><strong>Deployment</strong></li> |
| <li><a href="http://flink.apache.org/docs/master/setup/local_setup.html" class="active">Local</a></li> |
| <li><a href="http://flink.apache.org/docs/master/setup/cluster_setup.html">Cluster (Standalone)</a></li> |
| <li><a href="http://flink.apache.org/docs/master/setup/yarn_setup.html">YARN</a></li> |
| <li><a href="http://flink.apache.org/docs/master/setup/gce_setup.html">GCloud</a></li> |
| <li><a href="http://flink.apache.org/docs/master/setup/flink_on_tez.html">Flink on Tez <span class="badge">Beta</span></a></li> |
| |
| <li class="divider"></li> |
| <li><a href="http://flink.apache.org/docs/master/setup/config.html">Configuration</a></li> |
| </ul> |
| </li> |
| |
| <!-- Programming Guides --> |
| <li class="dropdown"> |
| <a href="http://flink.apache.org/docs/master/apis" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Programming Guides <span class="caret"></span></a> |
| <ul class="dropdown-menu" role="menu"> |
| <li><a href="http://flink.apache.org/docs/master/apis/programming_guide.html"><strong>Batch: DataSet API</strong></a></li> |
| <li><a href="http://flink.apache.org/docs/master/apis/streaming_guide.html"><strong>Streaming: DataStream API</strong> <span class="badge">Beta</span></a></li> |
| <li><a href="http://flink.apache.org/docs/master/apis/python.html">Python API <span class="badge">Beta</span></a></li> |
| |
| <li class="divider"></li> |
| <li><a href="scala_shell.html">Interactive Scala Shell</a></li> |
| <li><a href="http://flink.apache.org/docs/master/apis/dataset_transformations.html">Dataset Transformations</a></li> |
| <li><a href="http://flink.apache.org/docs/master/apis/best_practices.html">Best Practices</a></li> |
| <li><a href="http://flink.apache.org/docs/master/apis/example_connectors.html">Connectors</a></li> |
| <li><a href="http://flink.apache.org/docs/master/apis/examples.html">Examples</a></li> |
| <li><a href="http://flink.apache.org/docs/master/apis/local_execution.html">Local Execution</a></li> |
| <li><a href="http://flink.apache.org/docs/master/apis/cluster_execution.html">Cluster Execution</a></li> |
| <li><a href="http://flink.apache.org/docs/master/apis/cli.html">Command Line Interface</a></li> |
| <li><a href="http://flink.apache.org/docs/master/apis/web_client.html">Web Client</a></li> |
| <li><a href="http://flink.apache.org/docs/master/apis/iterations.html">Iterations</a></li> |
| <li><a href="http://flink.apache.org/docs/master/apis/java8.html">Java 8</a></li> |
| <li><a href="http://flink.apache.org/docs/master/apis/hadoop_compatibility.html">Hadoop Compatability <span class="badge">Beta</span></a></li> |
| </ul> |
| </li> |
| |
| <!-- Libraries --> |
| <li class="dropdown"> |
| <a href="http://flink.apache.org/docs/master/libs" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Libraries <span class="caret"></span></a> |
| <ul class="dropdown-menu" role="menu"> |
| <li><a href="http://flink.apache.org/docs/master/libs/spargel_guide.html">Graphs: Spargel</a></li> |
| <li><a href="http://flink.apache.org/docs/master/libs/gelly_guide.html">Graphs: Gelly <span class="badge">Beta</span></a></li> |
| <li><a href="http://flink.apache.org/docs/master/libs/ml/">Machine Learning <span class="badge">Beta</span></a></li> |
| <li><a href="http://flink.apache.org/docs/master/libs/table.html">Relational: Table <span class="badge">Beta</span></a></li> |
| </ul> |
| </li> |
| |
| <!-- Internals --> |
| <li class="dropdown"> |
| <a href="http://flink.apache.org/docs/master/internals" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Internals <span class="caret"></span></a> |
| <ul class="dropdown-menu" role="menu"> |
| <li role="presentation" class="dropdown-header"><strong>Contribute</strong></li> |
| <li><a href="http://flink.apache.org/docs/master/internals/how_to_contribute.html">How to Contribute</a></li> |
| <li><a href="http://flink.apache.org/docs/master/internals/coding_guidelines.html">Coding Guidelines</a></li> |
| <li><a href="http://flink.apache.org/docs/master/internals/ide_setup.html">IDE Setup</a></li> |
| <li><a href="http://flink.apache.org/docs/master/internals/logging.html">Logging</a></li> |
| <li class="divider"></li> |
| <li role="presentation" class="dropdown-header"><strong>Internals</strong></li> |
| <li><a href="http://flink.apache.org/docs/master/internals/general_arch.html">Architecture & Process Model</a></li> |
| <li><a href="http://flink.apache.org/docs/master/internals/types_serialization.html">Type Extraction & Serialization</a></li> |
| <li><a href="http://flink.apache.org/docs/master/internals/job_scheduling.html">Jobs & Scheduling</a></li> |
| <li><a href="http://flink.apache.org/docs/master/internals/add_operator.html">How-To: Add an Operator</a></li> |
| </ul> |
| </li> |
| </ul> |
| <form class="navbar-form navbar-right hidden-sm hidden-md" role="search" action="http://flink.apache.org/docs/master/search-results.html"> |
| <div class="form-group"> |
| <input type="text" class="form-control" name="q" placeholder="Search all pages"> |
| </div> |
| <button type="submit" class="btn btn-default">Search</button> |
| </form> |
| </div><!-- /.navbar-collapse --> |
| </div><!-- /.container --> |
| </nav> |
| |
| |
| |
| |
| |
| <!--Some of the Latex math notation has been adapted from Apache Spark MLlib's documentation--> |
| $$ |
| \newcommand{\R}{\mathbb{R}} |
| \newcommand{\E}{\mathbb{E}} |
| \newcommand{\x}{\mathbf{x}} |
| \newcommand{\y}{\mathbf{y}} |
| \newcommand{\wv}{\mathbf{w}} |
| \newcommand{\av}{\mathbf{\alpha}} |
| \newcommand{\bv}{\mathbf{b}} |
| \newcommand{\N}{\mathbb{N}} |
| \newcommand{\id}{\mathbf{I}} |
| \newcommand{\ind}{\mathbf{1}} |
| \newcommand{\0}{\mathbf{0}} |
| \newcommand{\unit}{\mathbf{e}} |
| \newcommand{\one}{\mathbf{1}} |
| \newcommand{\zero}{\mathbf{0}} |
| \newcommand\rfrac[2]{^{#1}\!/_{#2}} |
| \newcommand{\norm}[1]{\left\lVert#1\right\rVert} |
| $$ |
| |
| |
| <!-- Main content. --> |
| <div class="container"> |
| |
| |
| <div class="row"> |
| <div class="col-sm-10 col-sm-offset-1"> |
| <h1><a href="../ml">FlinkML</a> - SVM using CoCoA</h1> |
| |
| |
| |
| <ul id="markdown-toc"> |
| <li><a href="#description" id="markdown-toc-description">Description</a></li> |
| <li><a href="#operations" id="markdown-toc-operations">Operations</a> <ul> |
| <li><a href="#fit" id="markdown-toc-fit">Fit</a></li> |
| <li><a href="#predict" id="markdown-toc-predict">Predict</a></li> |
| </ul> |
| </li> |
| <li><a href="#parameters" id="markdown-toc-parameters">Parameters</a></li> |
| <li><a href="#examples" id="markdown-toc-examples">Examples</a></li> |
| </ul> |
| |
| <h2 id="description">Description</h2> |
| |
| <p>Implements an SVM with soft-margin using the communication-efficient distributed dual coordinate |
| ascent algorithm with hinge-loss function. |
| The algorithm solves the following minimization problem:</p> |
| |
| <script type="math/tex; mode=display">\min_{\mathbf{w} \in \mathbb{R}^d} \frac{\lambda}{2} \left\lVert \mathbf{w} \right\rVert^2 + \frac{1}{n} \sum_{i=1}^n l_{i}\left(\mathbf{w}^T\mathbf{x}_i\right)</script> |
| |
| <p>with $\mathbf{w}$ being the weight vector, $\lambda$ being the regularization constant, |
| <script type="math/tex">\mathbf{x}_i \in \mathbb{R}^d</script> being the data points and <script type="math/tex">l_{i}</script> being the convex loss |
| functions, which can also depend on the labels <script type="math/tex">y_{i} \in \mathbb{R}</script>. |
| In the current implementation the regularizer is the $\ell_2$-norm and the loss functions are the hinge-loss functions:</p> |
| |
| <script type="math/tex; mode=display">l_{i} = \max\left(0, 1 - y_{i} \mathbf{w}^T\mathbf{x}_i \right)</script> |
| |
| <p>With these choices, the problem definition is equivalent to a SVM with soft-margin. |
| Thus, the algorithm allows us to train a SVM with soft-margin.</p> |
| |
| <p>The minimization problem is solved by applying stochastic dual coordinate ascent (SDCA). |
| In order to make the algorithm efficient in a distributed setting, the CoCoA algorithm calculates |
| several iterations of SDCA locally on a data block before merging the local updates into a |
| valid global state. |
| This state is redistributed to the different data partitions where the next round of local SDCA |
| iterations is then executed. |
| The number of outer iterations and local SDCA iterations control the overall network costs, because |
| there is only network communication required for each outer iteration. |
| The local SDCA iterations are embarrassingly parallel once the individual data partitions have been |
| distributed across the cluster.</p> |
| |
| <p>The implementation of this algorithm is based on the work of |
| <a href="http://arxiv.org/abs/1409.1458">Jaggi et al.</a></p> |
| |
| <h2 id="operations">Operations</h2> |
| |
| <p><code>SVM</code> is a <code>Predictor</code>. |
| As such, it supports the <code>fit</code> and <code>predict</code> operation.</p> |
| |
| <h3 id="fit">Fit</h3> |
| |
| <p>SVM is trained given a set of <code>LabeledVector</code>:</p> |
| |
| <ul> |
| <li><code>fit: DataSet[LabeledVector] => Unit</code></li> |
| </ul> |
| |
| <h3 id="predict">Predict</h3> |
| |
| <p>SVM predicts for all subtypes of <code>Vector</code> the corresponding class label:</p> |
| |
| <ul> |
| <li><code>predict[T <: Vector]: DataSet[T] => DataSet[LabeledVector]</code></li> |
| </ul> |
| |
| <p>If we call predict with a <code>DataSet[LabeledVector]</code>, we make a prediction on the class label |
| for each example, and return a <code>DataSet[(Double, Double)]</code>. In each tuple the first element |
| is the true value, as was provided from the input <code>DataSet[LabeledVector]</code> and the second element |
| is the predicted value. You can then use these <code>(truth, prediction)</code> tuples to evaluate |
| the algorithm’s performance.</p> |
| |
| <ul> |
| <li><code>predict: DataSet[LabeledVector] => DataSet[(Double, Double)]</code></li> |
| </ul> |
| |
| <h2 id="parameters">Parameters</h2> |
| |
| <p>The SVM implementation can be controlled by the following parameters:</p> |
| |
| <table class="table table-bordered"> |
| <thead> |
| <tr> |
| <th class="text-left" style="width: 20%">Parameters</th> |
| <th class="text-center">Description</th> |
| </tr> |
| </thead> |
| |
| <tbody> |
| <tr> |
| <td><strong>Blocks</strong></td> |
| <td> |
| <p> |
| Sets the number of blocks into which the input data will be split. |
| On each block the local stochastic dual coordinate ascent method is executed. |
| This number should be set at least to the degree of parallelism. |
| If no value is specified, then the parallelism of the input DataSet is used as the number of blocks. |
| (Default value: <strong>None</strong>) |
| </p> |
| </td> |
| </tr> |
| <tr> |
| <td><strong>Iterations</strong></td> |
| <td> |
| <p> |
| Defines the maximum number of iterations of the outer loop method. |
| In other words, it defines how often the SDCA method is applied to the blocked data. |
| After each iteration, the locally computed weight vector updates have to be reduced to update the global weight vector value. |
| The new weight vector is broadcast to all SDCA tasks at the beginning of each iteration. |
| (Default value: <strong>10</strong>) |
| </p> |
| </td> |
| </tr> |
| <tr> |
| <td><strong>LocalIterations</strong></td> |
| <td> |
| <p> |
| Defines the maximum number of SDCA iterations. |
| In other words, it defines how many data points are drawn from each local data block to calculate the stochastic dual coordinate ascent. |
| (Default value: <strong>10</strong>) |
| </p> |
| </td> |
| </tr> |
| <tr> |
| <td><strong>Regularization</strong></td> |
| <td> |
| <p> |
| Defines the regularization constant of the SVM algorithm. |
| The higher the value, the smaller will the 2-norm of the weight vector be. |
| In case of a SVM with hinge loss this means that the SVM margin will be wider even though it might contain some false classifications. |
| (Default value: <strong>1.0</strong>) |
| </p> |
| </td> |
| </tr> |
| <tr> |
| <td><strong>Stepsize</strong></td> |
| <td> |
| <p> |
| Defines the initial step size for the updates of the weight vector. |
| The larger the step size is, the larger will be the contribution of the weight vector updates to the next weight vector value. |
| The effective scaling of the updates is $\frac{stepsize}{blocks}$. |
| This value has to be tuned in case that the algorithm becomes unstable. |
| (Default value: <strong>1.0</strong>) |
| </p> |
| </td> |
| </tr> |
| <tr> |
| <td><strong>Seed</strong></td> |
| <td> |
| <p> |
| Defines the seed to initialize the random number generator. |
| The seed directly controls which data points are chosen for the SDCA method. |
| (Default value: <strong>0</strong>) |
| </p> |
| </td> |
| </tr> |
| </tbody> |
| </table> |
| |
| <h2 id="examples">Examples</h2> |
| |
| <div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// Read the training data set, from a LibSVM formatted file</span> |
| <span class="k">val</span> <span class="n">trainingDS</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">LabeledVector</span><span class="o">]</span> <span class="k">=</span> <span class="n">env</span><span class="o">.</span><span class="n">readLibSVM</span><span class="o">(</span><span class="n">pathToTrainingFile</span><span class="o">)</span> |
| |
| <span class="c1">// Create the SVM learner</span> |
| <span class="k">val</span> <span class="n">svm</span> <span class="k">=</span> <span class="nc">SVM</span><span class="o">()</span> |
| <span class="o">.</span><span class="n">setBlocks</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setIterations</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setLocalIterations</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setRegularization</span><span class="o">(</span><span class="mf">0.5</span><span class="o">)</span> |
| <span class="o">.</span><span class="n">setStepsize</span><span class="o">(</span><span class="mf">0.5</span><span class="o">)</span> |
| |
| <span class="c1">// Learn the SVM model</span> |
| <span class="n">svm</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">trainingDS</span><span class="o">)</span> |
| |
| <span class="c1">// Read the testing data set</span> |
| <span class="k">val</span> <span class="n">testingDS</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">Vector</span><span class="o">]</span> <span class="k">=</span> <span class="n">env</span><span class="o">.</span><span class="n">readVectorFile</span><span class="o">(</span><span class="n">pathToTestingFile</span><span class="o">)</span> |
| |
| <span class="c1">// Calculate the predictions for the testing data set</span> |
| <span class="k">val</span> <span class="n">predictionDS</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">LabeledVector</span><span class="o">]</span> <span class="k">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">testingDS</span><span class="o">)</span></code></pre></div> |
| |
| |
| </div> |
| |
| <div class="col-sm-10 col-sm-offset-1"> |
| <!-- Disqus thread and some vertical offset --> |
| <div style="margin-top: 75px; margin-bottom: 50px" id="disqus_thread"></div> |
| </div> |
| </div> |
| |
| </div><!-- /.container --> |
| |
| <!-- jQuery (necessary for Bootstrap's JavaScript plugins) --> |
| <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script> |
| <!-- Include all compiled plugins (below), or include individual files as needed --> |
| <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script> |
| <script src="http://flink.apache.org/docs/master/page/js/codetabs.js"></script> |
| |
| <!-- Google Analytics --> |
| <script> |
| (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ |
| (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), |
| m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) |
| })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); |
| |
| ga('create', 'UA-52545728-1', 'auto'); |
| ga('send', 'pageview'); |
| </script> |
| |
| <!-- Disqus --> |
| <script type="text/javascript"> |
| var disqus_shortname = 'stratosphere-eu'; |
| (function() { |
| var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true; |
| dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js'; |
| (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq); |
| })(); |
| </script> |
| </body> |
| </html> |