blob: 188864bd15158e93138600fe7315f1a848199723 [file] [log] [blame]
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags -->
<title>Apache Flink 0.9.0 Documentation: FlinkML - Quickstart Guide</title>
<link rel="shortcut icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon">
<link rel="icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon">
<!-- Bootstrap -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">
<link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/flink.css">
<link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/syntax.css">
<link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/codetabs.css">
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
tex2jax: {
inlineMath: [['$','$'], ['\\(','\\)']] },
TeX: {
equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script type="text/javascript"
src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML">
</script>
<!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries -->
<!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
<!--[if lt IE 9]>
<script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
<script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
<![endif]-->
</head>
<body>
<!-- Top navbar. -->
<nav class="navbar navbar-default navbar-fixed-top">
<div class="container">
<!-- The logo. -->
<div class="navbar-header">
<button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1">
<span class="icon-bar"></span>
<span class="icon-bar"></span>
<span class="icon-bar"></span>
</button>
<div class="navbar-logo">
<a href="http://flink.apache.org"><img alt="Apache Flink" src="http://flink.apache.org/docs/0.9/page/img/navbar-brand-logo.jpg"></a>
</div>
</div><!-- /.navbar-header -->
<!-- The navigation links. -->
<div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1">
<ul class="nav navbar-nav">
<li><a href="http://flink.apache.org/docs/0.9/index.html">Overview<span class="hidden-sm hidden-xs"> 0.9.0</span></a></li>
<!-- Setup -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/0.9/setup" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Setup <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li><a href="http://flink.apache.org/docs/0.9/setup/building.html">Get Flink 0.9-SNAPSHOT</a></li>
<li class="divider"></li>
<li role="presentation" class="dropdown-header"><strong>Deployment</strong></li>
<li><a href="http://flink.apache.org/docs/0.9/setup/local_setup.html" class="active">Local</a></li>
<li><a href="http://flink.apache.org/docs/0.9/setup/cluster_setup.html">Cluster (Standalone)</a></li>
<li><a href="http://flink.apache.org/docs/0.9/setup/yarn_setup.html">YARN</a></li>
<li><a href="http://flink.apache.org/docs/0.9/setup/gce_setup.html">GCloud</a></li>
<li><a href="http://flink.apache.org/docs/0.9/setup/flink_on_tez.html">Flink on Tez <span class="badge">Beta</span></a></li>
<li class="divider"></li>
<li><a href="http://flink.apache.org/docs/0.9/setup/config.html">Configuration</a></li>
</ul>
</li>
<!-- Programming Guides -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/0.9/apis" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Programming Guides <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li><a href="http://flink.apache.org/docs/0.9/apis/programming_guide.html"><strong>Batch: DataSet API</strong></a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/streaming_guide.html"><strong>Streaming: DataStream API</strong> <span class="badge">Beta</span></a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/python.html">Python API <span class="badge">Beta</span></a></li>
<li class="divider"></li>
<li><a href="scala_shell.html">Interactive Scala Shell</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/dataset_transformations.html">Dataset Transformations</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/best_practices.html">Best Practices</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/example_connectors.html">Connectors</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/examples.html">Examples</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/local_execution.html">Local Execution</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/cluster_execution.html">Cluster Execution</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/cli.html">Command Line Interface</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/web_client.html">Web Client</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/iterations.html">Iterations</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/java8.html">Java 8</a></li>
<li><a href="http://flink.apache.org/docs/0.9/apis/hadoop_compatibility.html">Hadoop Compatibility <span class="badge">Beta</span></a></li>
</ul>
</li>
<!-- Libraries -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/0.9/libs" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Libraries <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li><a href="http://flink.apache.org/docs/0.9/libs/spargel_guide.html">Graphs: Spargel</a></li>
<li><a href="http://flink.apache.org/docs/0.9/libs/gelly_guide.html">Graphs: Gelly <span class="badge">Beta</span></a></li>
<li><a href="http://flink.apache.org/docs/0.9/libs/ml/">Machine Learning <span class="badge">Beta</span></a></li>
<li><a href="http://flink.apache.org/docs/0.9/libs/table.html">Relational: Table <span class="badge">Beta</span></a></li>
</ul>
</li>
<!-- Internals -->
<li class="dropdown">
<a href="http://flink.apache.org/docs/0.9/internals" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Internals <span class="caret"></span></a>
<ul class="dropdown-menu" role="menu">
<li role="presentation" class="dropdown-header"><strong>Contribute</strong></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/how_to_contribute.html">How to Contribute</a></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/coding_guidelines.html">Coding Guidelines</a></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/ide_setup.html">IDE Setup</a></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/logging.html">Logging</a></li>
<li class="divider"></li>
<li role="presentation" class="dropdown-header"><strong>Internals</strong></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/general_arch.html">Architecture &amp; Process Model</a></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/types_serialization.html">Type Extraction &amp; Serialization</a></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/job_scheduling.html">Jobs &amp; Scheduling</a></li>
<li><a href="http://flink.apache.org/docs/0.9/internals/add_operator.html">How-To: Add an Operator</a></li>
</ul>
</li>
</ul>
<form class="navbar-form navbar-right hidden-sm hidden-md" role="search" action="http://flink.apache.org/docs/0.9/search-results.html">
<div class="form-group">
<input type="text" class="form-control" name="q" placeholder="Search all pages">
</div>
<button type="submit" class="btn btn-default">Search</button>
</form>
</div><!-- /.navbar-collapse -->
</div><!-- /.container -->
</nav>
<!--Some of the Latex math notation has been adapted from Apache Spark MLlib's documentation-->
$$
\newcommand{\R}{\mathbb{R}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\newcommand{\wv}{\mathbf{w}}
\newcommand{\av}{\mathbf{\alpha}}
\newcommand{\bv}{\mathbf{b}}
\newcommand{\N}{\mathbb{N}}
\newcommand{\id}{\mathbf{I}}
\newcommand{\ind}{\mathbf{1}}
\newcommand{\0}{\mathbf{0}}
\newcommand{\unit}{\mathbf{e}}
\newcommand{\one}{\mathbf{1}}
\newcommand{\zero}{\mathbf{0}}
\newcommand\rfrac[2]{^{#1}\!/_{#2}}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
$$
<!-- Main content. -->
<div class="container">
<div class="row">
<div class="col-sm-10 col-sm-offset-1">
<h1><a href="../ml">FlinkML</a> - Quickstart Guide</h1>
<ul id="markdown-toc">
<li><a href="#introduction" id="markdown-toc-introduction">Introduction</a></li>
<li><a href="#linking-with-flinkml" id="markdown-toc-linking-with-flinkml">Linking with FlinkML</a></li>
<li><a href="#loading-data" id="markdown-toc-loading-data">Loading data</a></li>
<li><a href="#classification" id="markdown-toc-classification">Classification</a></li>
<li><a href="#data-pre-processing-and-pipelines" id="markdown-toc-data-pre-processing-and-pipelines">Data pre-processing and pipelines</a></li>
<li><a href="#where-to-go-from-here" id="markdown-toc-where-to-go-from-here">Where to go from here</a></li>
</ul>
<h2 id="introduction">Introduction</h2>
<p>FlinkML is designed to make learning from your data a straight-forward process, abstracting away
the complexities that usually come with big data learning tasks. In this
quick-start guide we will show just how easy it is to solve a simple supervised learning problem
using FlinkML. But first, some basics; feel free to skip the next few lines if you’re already
familiar with Machine Learning (ML).</p>
<p>As defined by Murphy <a href="#murphy">[1]</a>, ML deals with detecting patterns in data, and using those
learned patterns to make predictions about the future. We can categorize most ML algorithms into
two major categories: Supervised and Unsupervised Learning.</p>
<ul>
<li>
<p><strong>Supervised Learning</strong> deals with learning a function (mapping) from a set of inputs
(features) to a set of outputs. The learning is done using a <em>training set</em> of (input,
output) pairs that we use to approximate the mapping function. Supervised learning problems are
further divided into classification and regression problems. In classification problems we try to
predict the <em>class</em> that an example belongs to, for example whether a user is going to click on
an ad or not. Regression problems, on the other hand, are about predicting (real) numerical
values, often called the dependent variable, for example what the temperature will be tomorrow.</p>
</li>
<li>
<p><strong>Unsupervised Learning</strong> deals with discovering patterns and regularities in the data. An example
of this would be <em>clustering</em>, where we try to discover groupings of the data from the
descriptive features. Unsupervised learning can also be used for feature selection, for example
through <a href="https://en.wikipedia.org/wiki/Principal_component_analysis">principal components analysis</a>.</p>
</li>
</ul>
<h2 id="linking-with-flinkml">Linking with FlinkML</h2>
<p>In order to use FlinkML in your project, first you have to
<a href="http://ci.apache.org/projects/flink/flink-docs-master/apis/programming_guide.html#linking-with-flink">set up a Flink program</a>.
Next, you have to add the FlinkML dependency to the <code>pom.xml</code> of your project:</p>
<div class="highlight"><pre><code class="language-xml" data-lang="xml"><span class="nt">&lt;dependency&gt;</span>
<span class="nt">&lt;groupId&gt;</span>org.apache.flink<span class="nt">&lt;/groupId&gt;</span>
<span class="nt">&lt;artifactId&gt;</span>flink-ml<span class="nt">&lt;/artifactId&gt;</span>
<span class="nt">&lt;version&gt;</span>0.9.0<span class="nt">&lt;/version&gt;</span>
<span class="nt">&lt;/dependency&gt;</span></code></pre></div>
<h2 id="loading-data">Loading data</h2>
<p>To load data to be used with FlinkML we can use the ETL capabilities of Flink, or specialized
functions for formatted data, such as the LibSVM format. For supervised learning problems it is
common to use the <code>LabeledVector</code> class to represent the <code>(label, features)</code> examples. A <code>LabeledVector</code>
object will have a FlinkML <code>Vector</code> member representing the features of the example and a <code>Double</code>
member which represents the label, which could be the class in a classification problem, or the dependent
variable for a regression problem.</p>
<p>As an example, we can use Haberman’s Survival Data Set, which you can
<a href="http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data">download from the UCI ML repository</a>.
This dataset <em>“contains cases from a study conducted on the survival of patients who had undergone
surgery for breast cancer”</em>. The data comes in a comma-separated file, where the first 3 columns
are the features and the last (4th) column is the class label, which indicates whether the patient
survived 5 years or longer (label 1), or died within 5 years (label 2). You can check the <a href="https://archive.ics.uci.edu/ml/datasets/Haberman%27s+Survival">UCI
page</a> for more information on the data.</p>
<p>We can load the data as a <code>DataSet[String]</code> first:</p>
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.flink.api.scala.ExecutionEnvironment</span>
<span class="k">val</span> <span class="n">env</span> <span class="k">=</span> <span class="nc">ExecutionEnvironment</span><span class="o">.</span><span class="n">getExecutionEnvironment</span>
<span class="k">val</span> <span class="n">survival</span> <span class="k">=</span> <span class="n">env</span><span class="o">.</span><span class="n">readCsvFile</span><span class="o">[(</span><span class="kt">String</span>, <span class="kt">String</span>, <span class="kt">String</span>, <span class="kt">String</span><span class="o">)](</span><span class="s">&quot;/path/to/haberman.data&quot;</span><span class="o">)</span></code></pre></div>
<p>We can now transform the data into a <code>DataSet[LabeledVector]</code>. This will allow us to use the
dataset with the FlinkML classification algorithms. We know that the 4th element of the dataset
is the class label, and the rest are features, so we can build <code>LabeledVector</code> elements like this:</p>
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.flink.ml.common.LabeledVector</span>
<span class="k">import</span> <span class="nn">org.apache.flink.ml.math.DenseVector</span>
<span class="k">val</span> <span class="n">survivalLV</span> <span class="k">=</span> <span class="n">survival</span>
<span class="o">.</span><span class="n">map</span><span class="o">{</span><span class="n">tuple</span> <span class="k">=&gt;</span>
<span class="k">val</span> <span class="n">list</span> <span class="k">=</span> <span class="n">tuple</span><span class="o">.</span><span class="n">productIterator</span><span class="o">.</span><span class="n">toList</span>
<span class="k">val</span> <span class="n">numList</span> <span class="k">=</span> <span class="n">list</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">asInstanceOf</span><span class="o">[</span><span class="kt">String</span><span class="o">].</span><span class="n">toDouble</span><span class="o">)</span>
<span class="nc">LabeledVector</span><span class="o">(</span><span class="n">numList</span><span class="o">(</span><span class="mi">3</span><span class="o">),</span> <span class="nc">DenseVector</span><span class="o">(</span><span class="n">numList</span><span class="o">.</span><span class="n">take</span><span class="o">(</span><span class="mi">3</span><span class="o">).</span><span class="n">toArray</span><span class="o">))</span>
<span class="o">}</span></code></pre></div>
<p>We can then use this data to train a learner. We will however use another dataset to exemplify
building a learner; that will allow us to show how we can import other dataset formats.</p>
<p><strong>LibSVM files</strong></p>
<p>A common format for ML datasets is the LibSVM format and a number of datasets using that format can be
found <a href="http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/">in the LibSVM datasets website</a>. FlinkML provides utilities for loading
datasets using the LibSVM format through the <code>readLibSVM</code> function available through the <code>MLUtils</code>
object.
You can also save datasets in the LibSVM format using the <code>writeLibSVM</code> function.
Let’s import the svmguide1 dataset. You can download the
<a href="http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/svmguide1">training set here</a>
and the <a href="http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/svmguide1.t">test set here</a>.
This is an astroparticle binary classification dataset, used by Hsu et al. <a href="#hsu">[3]</a> in their
practical Support Vector Machine (SVM) guide. It contains 4 numerical features, and the class label.</p>
<p>We can then import the dataset using:</p>
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.flink.ml.MLUtils</span>
<span class="k">val</span> <span class="n">astroTrain</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">LabeledVector</span><span class="o">]</span> <span class="k">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="n">readLibSVM</span><span class="o">(</span><span class="s">&quot;/path/to/svmguide1&quot;</span><span class="o">)</span>
<span class="k">val</span> <span class="n">astroTest</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">LabeledVector</span><span class="o">]</span> <span class="k">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="n">readLibSVM</span><span class="o">(</span><span class="s">&quot;/path/to/svmguide1.t&quot;</span><span class="o">)</span></code></pre></div>
<p>This gives us two <code>DataSet[LabeledVector]</code> objects that we will use in the following section to
create a classifier.</p>
<h2 id="classification">Classification</h2>
<p>Once we have imported the dataset we can train a <code>Predictor</code> such as a linear SVM classifier.
We can set a number of parameters for the classifier. Here we set the <code>Blocks</code> parameter,
which the underlying CoCoA algorithm <a href="#jaggi">[2]</a> uses to split the input. The
regularization parameter determines the amount of $l_2$ regularization applied, which is used
to avoid overfitting. The step size determines the contribution of the weight vector updates to
the next weight vector value. This parameter sets the initial step size.</p>
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.flink.ml.classification.SVM</span>
<span class="k">val</span> <span class="n">svm</span> <span class="k">=</span> <span class="nc">SVM</span><span class="o">()</span>
<span class="o">.</span><span class="n">setBlocks</span><span class="o">(</span><span class="n">env</span><span class="o">.</span><span class="n">getParallelism</span><span class="o">)</span>
<span class="o">.</span><span class="n">setIterations</span><span class="o">(</span><span class="mi">100</span><span class="o">)</span>
<span class="o">.</span><span class="n">setRegularization</span><span class="o">(</span><span class="mf">0.001</span><span class="o">)</span>
<span class="o">.</span><span class="n">setStepsize</span><span class="o">(</span><span class="mf">0.1</span><span class="o">)</span>
<span class="o">.</span><span class="n">setSeed</span><span class="o">(</span><span class="mi">42</span><span class="o">)</span>
<span class="n">svm</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">astroTrain</span><span class="o">)</span></code></pre></div>
<p>We can now make predictions on the test set.</p>
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">predictionPairs</span> <span class="k">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">astroTest</span><span class="o">)</span></code></pre></div>
<p>Next we will see how we can pre-process our data, and use the ML pipelines capabilities of FlinkML.</p>
<h2 id="data-pre-processing-and-pipelines">Data pre-processing and pipelines</h2>
<p>A pre-processing step that is often encouraged <a href="#hsu">[3]</a> when using SVM classification is scaling
the input features to the [0, 1] range, in order to avoid features with extreme values
dominating the rest.
FlinkML has a number of <code>Transformers</code> such as <code>MinMaxScaler</code> that are used to pre-process data,
and a key feature is the ability to chain <code>Transformers</code> and <code>Predictors</code> together. This allows
us to run the same pipeline of transformations and make predictions on the train and test data in
a straight-forward and type-safe manner. You can read more on the pipeline system of FlinkML
<a href="pipelines.html">in the pipelines documentation</a>.</p>
<p>Let us first create a normalizing transformer for the features in our dataset, and chain it to a
new SVM classifier.</p>
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.flink.ml.preprocessing.MinMaxScaler</span>
<span class="k">val</span> <span class="n">scaler</span> <span class="k">=</span> <span class="nc">MinMaxScaler</span><span class="o">()</span>
<span class="k">val</span> <span class="n">scaledSVM</span> <span class="k">=</span> <span class="n">scaler</span><span class="o">.</span><span class="n">chainPredictor</span><span class="o">(</span><span class="n">svm</span><span class="o">)</span></code></pre></div>
<p>We can now use our newly created pipeline to make predictions on the test set.
First we call fit again, to train the scaler and the SVM classifier.
The data of the test set will then be automatically scaled before being passed on to the SVM to
make predictions.</p>
<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="n">scaledSVM</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">astroTrain</span><span class="o">)</span>
<span class="k">val</span> <span class="n">predictionPairsScaled</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[(</span><span class="kt">Double</span>, <span class="kt">Double</span><span class="o">)]</span> <span class="k">=</span> <span class="n">scaledSVM</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">astroTest</span><span class="o">)</span></code></pre></div>
<p>The scaled inputs should give us better prediction performance.
The result of the prediction on <code>LabeledVector</code>s is a data set of tuples where the first entry denotes the true label value and the second entry is the predicted label value.</p>
<h2 id="where-to-go-from-here">Where to go from here</h2>
<p>This quickstart guide can act as an introduction to the basic concepts of FlinkML, but there’s a lot
more you can do.
We recommend going through the <a href="index.html">FlinkML documentation</a>, and trying out the different
algorithms.
A very good way to get started is to play around with interesting datasets from the UCI ML
repository and the LibSVM datasets.
Tackling an interesting problem from a website like <a href="https://www.kaggle.com">Kaggle</a> or
<a href="http://www.drivendata.org/">DrivenData</a> is also a great way to learn by competing with other
data scientists.
If you would like to contribute some new algorithms take a look at our
<a href="contribution_guide.html">contribution guide</a>.</p>
<p><strong>References</strong></p>
<p><a name="murphy"></a>[1] Murphy, Kevin P. <em>Machine learning: a probabilistic perspective.</em> MIT
press, 2012.</p>
<p><a name="jaggi"></a>[2] Jaggi, Martin, et al. <em>Communication-efficient distributed dual
coordinate ascent.</em> Advances in Neural Information Processing Systems. 2014.</p>
<p><a name="hsu"></a>[3] Hsu, Chih-Wei, Chih-Chung Chang, and Chih-Jen Lin.
<em>A practical guide to support vector classification.</em> 2003.</p>
</div>
<div class="col-sm-10 col-sm-offset-1">
<!-- Disqus thread and some vertical offset -->
<div style="margin-top: 75px; margin-bottom: 50px" id="disqus_thread"></div>
</div>
</div>
</div><!-- /.container -->
<!-- jQuery (necessary for Bootstrap's JavaScript plugins) -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script>
<!-- Include all compiled plugins (below), or include individual files as needed -->
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script>
<script src="http://flink.apache.org/docs/0.9/page/js/codetabs.js"></script>
<!-- Google Analytics -->
<script>
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
ga('create', 'UA-52545728-1', 'auto');
ga('send', 'pageview');
</script>
<!-- Disqus -->
<script type="text/javascript">
var disqus_shortname = 'stratosphere-eu';
(function() {
var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true;
dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js';
(document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq);
})();
</script>
</body>
</html>