blob: bdec003dc5086f79f82d77f3ad828570c9269eb6 [file] [log] [blame]
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
<title>Apache Flink 0.9.0 Documentation: FlinkML - Optimization</title>
<link rel="shortcut icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon">
<link rel="icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon">
<!-- Bootstrap -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">
<link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/flink.css">
<link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/syntax.css">
<link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/codetabs.css">
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
tex2jax: {
inlineMath: [['$','$'], ['\\(','\\)']] },
TeX: {
equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script type="text/javascript"
src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
</script>
<!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
<!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
</head>
<body>
<!-- Top navbar. -->
<nav class="navbar navbar-default navbar-fixed-top">
<div class="container">
<!-- The logo. -->
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1">
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<div class="navbar-logo">
<a href="http://flink.apache.org"><img alt="Apache Flink" src="http://flink.apache.org/docs/0.9/page/img/navbar-brand-logo.jpg"></a>
</div>
</div><!-- /.navbar-header -->
<!-- The navigation links. -->
<div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
<ul class="nav navbar-nav">
<li><a href="http://flink.apache.org/docs/0.9/index.html">Overview<span class="hidden-sm hidden-xs"> 0.9.0</span></a></li>
<!-- Setup -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/0.9/setup" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Setup <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li><a href="http://flink.apache.org/docs/0.9/setup/building.html">Get Flink 0.9-SNAPSHOT</a></li>
<li class="divider"></li>
<li role="presentation" class="dropdown-header"><strong>Deployment</strong></li>
<li><a href="http://flink.apache.org/docs/0.9/setup/local_setup.html" class="active">Local</a></li>
<li><a href="http://flink.apache.org/docs/0.9/setup/cluster_setup.html">Cluster (Standalone)</a></li>
<li><a href="http://flink.apache.org/docs/0.9/setup/yarn_setup.html">YARN</a></li>
<li><a href="http://flink.apache.org/docs/0.9/setup/gce_setup.html">GCloud</a></li>
<li><a href="http://flink.apache.org/docs/0.9/setup/flink_on_tez.html">Flink on Tez <span class="badge">Beta</span></a></li>
<li class="divider"></li>
<li><a href="http://flink.apache.org/docs/0.9/setup/config.html">Configuration</a></li>
</ul>
</li>
<!-- Programming Guides -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/0.9/apis" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Programming Guides <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li><a href="http://flink.apache.org/docs/0.9/apis/programming_guide.html"><strong>Batch: DataSet API</strong></a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/streaming_guide.html"><strong>Streaming: DataStream API</strong> <span class="badge">Beta</span></a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/python.html">Python API <span class="badge">Beta</span></a></li>
<li class="divider"></li>
<li><a href="scala_shell.html">Interactive Scala Shell</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/dataset_transformations.html">Dataset Transformations</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/best_practices.html">Best Practices</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/example_connectors.html">Connectors</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/examples.html">Examples</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/local_execution.html">Local Execution</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/cluster_execution.html">Cluster Execution</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/cli.html">Command Line Interface</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/web_client.html">Web Client</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/iterations.html">Iterations</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/java8.html">Java 8</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/hadoop_compatibility.html">Hadoop Compatability <span class="badge">Beta</span></a></li>
</ul>
</li>
<!-- Libraries -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/0.9/libs" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Libraries <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li><a href="http://flink.apache.org/docs/0.9/libs/spargel_guide.html">Graphs: Spargel</a></li>
<li><a href="http://flink.apache.org/docs/0.9/libs/gelly_guide.html">Graphs: Gelly <span class="badge">Beta</span></a></li>
<li><a href="http://flink.apache.org/docs/0.9/libs/ml/">Machine Learning <span class="badge">Beta</span></a></li>
<li><a href="http://flink.apache.org/docs/0.9/libs/table.html">Relational: Table <span class="badge">Beta</span></a></li>
</ul>
</li>
<!-- Internals -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/0.9/internals" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Internals <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li role="presentation" class="dropdown-header"><strong>Contribute</strong></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/how_to_contribute.html">How to Contribute</a></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/coding_guidelines.html">Coding Guidelines</a></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/ide_setup.html">IDE Setup</a></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/logging.html">Logging</a></li>
<li class="divider"></li>
<li role="presentation" class="dropdown-header"><strong>Internals</strong></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/general_arch.html">Architecture &amp; Process Model</a></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/types_serialization.html">Type Extraction &amp; Serialization</a></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/job_scheduling.html">Jobs &amp; Scheduling</a></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/add_operator.html">How-To: Add an Operator</a></li>
</ul>
</li>
</ul>
<form class="navbar-form navbar-right hidden-sm hidden-md" role="search" action="http://flink.apache.org/docs/0.9/search-results.html">
<div class="form-group">
<input type="text" class="form-control" name="q" placeholder="Search all pages">
</div>
<button type="submit" class="btn btn-default">Search</button>
</form>
</div><!-- /.navbar-collapse -->
</div><!-- /.container -->
</nav>
<!--Some of the Latex math notation has been adapted from Apache Spark MLlib's documentation-->
$$
\newcommand{\R}{\mathbb{R}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\newcommand{\wv}{\mathbf{w}}
\newcommand{\av}{\mathbf{\alpha}}
\newcommand{\bv}{\mathbf{b}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\id}{\mathbf{I}}
\newcommand{\ind}{\mathbf{1}}
\newcommand{\0}{\mathbf{0}}
\newcommand{\unit}{\mathbf{e}}
\newcommand{\one}{\mathbf{1}}
\newcommand{\zero}{\mathbf{0}}
\newcommand\rfrac[2]{^{#1}\!/_{#2}}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
$$
<!-- Main content. -->
<div class="container">
<div class="row">
<div class="col-sm-10 col-sm-offset-1">
<h1><a href="../ml">FlinkML</a> - Optimization</h1>
<ul id="markdown-toc">
<li><a href="#mathematical-formulation" id="markdown-toc-mathematical-formulation">Mathematical Formulation</a> <ul>
<li><a href="#loss-functions" id="markdown-toc-loss-functions">Loss Functions</a></li>
<li><a href="#regularization-types" id="markdown-toc-regularization-types">Regularization Types</a></li>
</ul>
</li>
<li><a href="#stochastic-gradient-descent" id="markdown-toc-stochastic-gradient-descent">Stochastic Gradient Descent</a> <ul>
<li><a href="#regularization" id="markdown-toc-regularization">Regularization</a></li>
<li><a href="#parameters" id="markdown-toc-parameters">Parameters</a></li>
<li><a href="#loss-function" id="markdown-toc-loss-function">Loss Function</a></li>
<li><a href="#examples" id="markdown-toc-examples">Examples</a></li>
</ul>
</li>
</ul>
<h2 id="mathematical-formulation">Mathematical Formulation</h2>
<p>The optimization framework in FlinkML is a developer-oriented package that can be used to solve
<a href="https://en.wikipedia.org/wiki/Mathematical_optimization">optimization</a>
problems common in Machine Learning (ML) tasks. In the supervised learning context, this usually
involves finding a model, as defined by a set of parameters $w$, that minimize a function $f(\wv)$
given a set of $(\x, y)$ examples,
where $\x$ is a feature vector and $y$ is a real number, which can represent either a real value in
the regression case, or a class label in the classification case. In supervised learning, the
function to be minimized is usually of the form:</p>
<p>\begin{equation} \label{eq:objectiveFunc}
f(\wv) :=
\frac1n \sum_{i=1}^n L(\wv;\x_i,y_i) +
\lambda\, R(\wv)
\ .
\end{equation}</p>
<p>where $L$ is the loss function and $R(\wv)$ the regularization penalty. We use $L$ to measure how
well the model fits the observed data, and we use $R$ in order to impose a complexity cost to the
model, with $\lambda &gt; 0$ being the regularization parameter.</p>
<h3 id="loss-functions">Loss Functions</h3>
<p>In supervised learning, we use loss functions in order to measure the model fit, by
penalizing errors in the predictions $p$ made by the model compared to the true $y$ for each
example. Different loss functions can be used for regression (e.g. Squared Loss) and classification
(e.g. Hinge Loss) tasks.</p>
<p>Some common loss functions are:</p>
<ul>
<li>Squared Loss: $ \frac{1}{2} \left(\wv^T \cdot \x - y\right)^2, \quad y \in \R $</li>
<li>Hinge Loss: $ \max \left(0, 1 - y ~ \wv^T \cdot \x\right), \quad y \in {-1, +1} $</li>
<li>Logistic Loss: $ \log\left(1+\exp\left( -y ~ \wv^T \cdot \x\right)\right), \quad y \in {-1, +1}$</li>
</ul>
<h3 id="regularization-types">Regularization Types</h3>
<p><a href="https://en.wikipedia.org/wiki/Regularization_(mathematics)">Regularization</a> in machine learning
imposes penalties to the estimated models, in order to reduce overfitting. The most common penalties
are the $L_1$ and $L_2$ penalties, defined as:</p>
<ul>
<li>$L_1$: $R(\wv) = \norm{\wv}_1$</li>
<li>$L_2$: $R(\wv) = \frac{1}{2}\norm{\wv}_2^2$</li>
</ul>
<p>The $L_2$ penalty penalizes large weights, favoring solutions with more small weights rather than
few large ones.
The $L_1$ penalty can be used to drive a number of the solution coefficients to 0, thereby
producing sparse solutions.
The regularization constant $\lambda$ in $\eqref{eq:objectiveFunc}$ determines the amount of regularization applied to the model,
and is usually determined through model cross-validation.
A good comparison of regularization types can be found in <a href="http://www.robotics.stanford.edu/~ang/papers/icml04-l1l2.pdf">this</a> paper by Andrew Ng.
Which regularization type is supported depends on the actually used optimization algorithm.</p>
<h2 id="stochastic-gradient-descent">Stochastic Gradient Descent</h2>
<p>In order to find a (local) minimum of a function, Gradient Descent methods take steps in the
direction opposite to the gradient of the function $\eqref{eq:objectiveFunc}$ taken with
respect to the current parameters (weights).
In order to compute the exact gradient we need to perform one pass through all the points in
a dataset, making the process computationally expensive.
An alternative is Stochastic Gradient Descent (SGD) where at each iteration we sample one point
from the complete dataset and update the parameters for each point, in an online manner.</p>
<p>In mini-batch SGD we instead sample random subsets of the dataset, and compute the gradient
over each batch. At each iteration of the algorithm we update the weights once, based on
the average of the gradients computed from each mini-batch.</p>
<p>An important parameter is the learning rate $\eta$, or step size, which is currently determined as
$\eta = \eta_0/\sqrt{j}$, where $\eta_0$ is the initial step size and $j$ is the iteration
number. The setting of the initial step size can significantly affect the performance of the
algorithm. For some practical tips on tuning SGD see Leon Botou’s
<a href="http://research.microsoft.com/pubs/192769/tricks-2012.pdf">Stochastic Gradient Descent Tricks</a>”.</p>
<p>The current implementation of SGD uses the whole partition, making it
effectively a batch gradient descent. Once a sampling operator has been introduced in Flink, true
mini-batch SGD will be performed.</p>
<h3 id="regularization">Regularization</h3>
<p>FlinkML supports Stochastic Gradient Descent with L1, L2 and no regularization.
The following list contains a mapping between the implementing classes and the regularization function.</p>
<table class="table table-bordered">
<thead>
<tr>
<th class="text-left" style="width: 20%">Class Name</th>
<th class="text-center">Regularization function $R(\wv)$</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>SimpleGradient</code></td>
<td>$R(\wv) = 0$</td>
</tr>
<tr>
<td><code>GradientDescentL1</code></td>
<td>$R(\wv) = \norm{\wv}_1$</td>
</tr>
<tr>
<td><code>GradientDescentL2</code></td>
<td>$R(\wv) = \frac{1}{2}\norm{\wv}_2^2$</td>
</tr>
</tbody>
</table>
<h3 id="parameters">Parameters</h3>
<p>The stochastic gradient descent implementation can be controlled by the following parameters:</p>
<table class="table table-bordered">
<thead>
<tr>
<th class="text-left" style="width: 20%">Parameter</th>
<th class="text-center">Description</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>LossFunction</strong></td>
<td>
<p>
The loss function to be optimized. (Default value: <strong>None</strong>)
</p>
</td>
</tr>
<tr>
<td><strong>RegularizationConstant</strong></td>
<td>
<p>
The amount of regularization to apply. (Default value: <strong>0.0</strong>)
</p>
</td>
</tr>
<tr>
<td><strong>Iterations</strong></td>
<td>
<p>
The maximum number of iterations. (Default value: <strong>10</strong>)
</p>
</td>
</tr>
<tr>
<td><strong>LearningRate</strong></td>
<td>
<p>
Initial learning rate for the gradient descent method.
This value controls how far the gradient descent method moves in the opposite direction
of the gradient.
(Default value: <strong>0.1</strong>)
</p>
</td>
</tr>
<tr>
<td><strong>ConvergenceThreshold</strong></td>
<td>
<p>
When set, iterations stop if the relative change in the value of the objective function $\eqref{eq:objectiveFunc}$ is less than the provided threshold, $\tau$.
The convergence criterion is defined as follows: $\left| \frac{f(\wv)_{i-1} - f(\wv)_i}{f(\wv)_{i-1}}\right| &lt; \tau$.
(Default value: <strong>None</strong>)
</p>
</td>
</tr>
</tbody>
</table>
<h3 id="loss-function">Loss Function</h3>
<p>The loss function which is minimized has to implement the <code>LossFunction</code> interface, which defines methods to compute the loss and the gradient of it.
Either one defines ones own <code>LossFunction</code> or one uses the <code>GenericLossFunction</code> class which constructs the loss function from an outer loss function and a prediction function.
An example can be seen here</p>
<p><code>Scala
val lossFunction = GenericLossFunction(SquaredLoss, LinearPrediction)
</code></p>
<p>The full list of supported outer loss functions can be found <a href="#partial-loss-function-values">here</a>.
The full list of supported prediction functions can be found <a href="#prediction-function-values">here</a>.</p>
<h4 id="partial-loss-function-values">Partial Loss Function Values</h4>
<table class="table table-bordered">
<thead>
<tr>
<th class="text-left" style="width: 20%">Function Name</th>
<th class="text-center">Description</th>
<th class="text-center">Loss</th>
<th class="text-center">Loss Derivative</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>SquaredLoss</strong></td>
<td>
<p>
Loss function most commonly used for regression tasks.
</p>
</td>
<td class="text-center">$\frac{1}{2} (\wv^T \cdot \x - y)^2$</td>
<td class="text-center">$\wv^T \cdot \x - y$</td>
</tr>
</tbody>
</table>
<h4 id="prediction-function-values">Prediction Function Values</h4>
<table class="table table-bordered">
<thead>
<tr>
<th class="text-left" style="width: 20%">Function Name</th>
<th class="text-center">Description</th>
<th class="text-center">Prediction</th>
<th class="text-center">Prediction Gradient</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>LinearPrediction</strong></td>
<td>
<p>
The function most commonly used for linear models, such as linear regression and
linear classifiers.
</p>
</td>
<td class="text-center">$\x^T \cdot \wv$</td>
<td class="text-center">$\x$</td>
</tr>
</tbody>
</table>
<h3 id="examples">Examples</h3>
<p>In the Flink implementation of SGD, given a set of examples in a <code>DataSet[LabeledVector]</code> and
optionally some initial weights, we can use <code>GradientDescentL1.optimize()</code> in order to optimize
the weights for the given data.</p>
<p>The user can provide an initial <code>DataSet[WeightVector]</code>,
which contains one <code>WeightVector</code> element, or use the default weights which are all set to 0.
A <code>WeightVector</code> is a container class for the weights, which separates the intercept from the
weight vector. This allows us to avoid applying regularization to the intercept.</p>
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// Create stochastic gradient descent solver</span>
<span class="k">val</span> <span class="n">sgd</span> <span class="k">=</span> <span class="nc">GradientDescentL1</span><span class="o">()</span>
<span class="o">.</span><span class="n">setLossFunction</span><span class="o">(</span><span class="nc">SquaredLoss</span><span class="o">())</span>
<span class="o">.</span><span class="n">setRegularizationConstant</span><span class="o">(</span><span class="mf">0.2</span><span class="o">)</span>
<span class="o">.</span><span class="n">setIterations</span><span class="o">(</span><span class="mi">100</span><span class="o">)</span>
<span class="o">.</span><span class="n">setLearningRate</span><span class="o">(</span><span class="mf">0.01</span><span class="o">)</span>
<span class="c1">// Obtain data</span>
<span class="k">val</span> <span class="n">trainingDS</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">LabeledVector</span><span class="o">]</span> <span class="k">=</span> <span class="o">...</span>
<span class="c1">// Optimize the weights, according to the provided data</span>
<span class="k">val</span> <span class="n">weightDS</span> <span class="k">=</span> <span class="n">sgd</span><span class="o">.</span><span class="n">optimize</span><span class="o">(</span><span class="n">trainingDS</span><span class="o">)</span></code></pre></div>
</div>
<div class="col-sm-10 col-sm-offset-1">
<!-- Disqus thread and some vertical offset -->
<div style="margin-top: 75px; margin-bottom: 50px" id="disqus_thread"></div>
</div>
</div>
</div><!-- /.container -->
<!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script>
<!-- Include all compiled plugins (below), or include individual files as needed -->
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script>
<script src="http://flink.apache.org/docs/0.9/page/js/codetabs.js"></script>
<!-- Google Analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-52545728-1', 'auto');
ga('send', 'pageview');
</script>
<!-- Disqus -->
<script type="text/javascript">
var disqus_shortname = 'stratosphere-eu';
(function() {
var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
(document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
})();
</script>
</body>
</html>