|  | 
 | <!DOCTYPE html> | 
 | <!--[if lt IE 7]>      <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> | 
 | <!--[if IE 7]>         <html class="no-js lt-ie9 lt-ie8"> <![endif]--> | 
 | <!--[if IE 8]>         <html class="no-js lt-ie9"> <![endif]--> | 
 | <!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> | 
 |     <head> | 
 |         <meta charset="utf-8"> | 
 |         <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> | 
 |         <meta name="viewport" content="width=device-width, initial-scale=1.0"> | 
 |  | 
 |         <title>Linear Methods - RDD-based API - Spark 3.5.0 Documentation</title> | 
 |          | 
 |  | 
 |          | 
 |  | 
 |  | 
 |         <link rel="stylesheet" href="css/bootstrap.min.css"> | 
 |         <link rel="preconnect" href="https://fonts.googleapis.com"> | 
 |         <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> | 
 |         <link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&Courier+Prime:wght@400;700&display=swap" rel="stylesheet"> | 
 |         <link href="css/custom.css" rel="stylesheet"> | 
 |         <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> | 
 |  | 
 |         <link rel="stylesheet" href="css/pygments-default.css"> | 
 |         <link rel="stylesheet" href="css/docsearch.min.css" /> | 
 |         <link rel="stylesheet" href="css/docsearch.css"> | 
 |  | 
 |     <!-- Matomo --> | 
 |     <script type="text/javascript"> | 
 |         var _paq = window._paq = window._paq || []; | 
 |         /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ | 
 |         _paq.push(["disableCookies"]); | 
 |         _paq.push(['trackPageView']); | 
 |         _paq.push(['enableLinkTracking']); | 
 |         (function() { | 
 |             var u="https://analytics.apache.org/"; | 
 |             _paq.push(['setTrackerUrl', u+'matomo.php']); | 
 |             _paq.push(['setSiteId', '40']); | 
 |             var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; | 
 |             g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); | 
 |         })(); | 
 |     </script> | 
 |     <!-- End Matomo Code --> | 
 |     </head> | 
 |     <body class="global"> | 
 |         <!--[if lt IE 7]> | 
 |             <p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> | 
 |         <![endif]--> | 
 |  | 
 |         <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html --> | 
 |  | 
 |         <nav class="navbar navbar-expand-lg navbar-dark p-0 px-4 fixed-top" style="background: #1d6890;" id="topbar"> | 
 |             <div class="navbar-brand"><a href="index.html"> | 
 |                 <img src="img/spark-logo-rev.svg" width="141" height="72"/></a><span class="version">3.5.0</span> | 
 |             </div> | 
 |             <button class="navbar-toggler" type="button" data-toggle="collapse" | 
 |                     data-target="#navbarCollapse" aria-controls="navbarCollapse" | 
 |                     aria-expanded="false" aria-label="Toggle navigation"> | 
 |                 <span class="navbar-toggler-icon"></span> | 
 |             </button> | 
 |             <div class="collapse navbar-collapse" id="navbarCollapse"> | 
 |                 <ul class="navbar-nav me-auto"> | 
 |                     <li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li> | 
 |  | 
 |                     <li class="nav-item dropdown"> | 
 |                         <a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a> | 
 |                         <div class="dropdown-menu" aria-labelledby="navbarQuickStart"> | 
 |                             <a class="dropdown-item" href="quick-start.html">Quick Start</a> | 
 |                             <a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a> | 
 |                             <a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a> | 
 |                             <a class="dropdown-item" href="structured-streaming-programming-guide.html">Structured Streaming</a> | 
 |                             <a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a> | 
 |                             <a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a> | 
 |                             <a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a> | 
 |                             <a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a> | 
 |                             <a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a> | 
 |                         </div> | 
 |                     </li> | 
 |  | 
 |                     <li class="nav-item dropdown"> | 
 |                         <a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a> | 
 |                         <div class="dropdown-menu" aria-labelledby="navbarAPIDocs"> | 
 |                             <a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a> | 
 |                             <a class="dropdown-item" href="api/java/index.html">Java</a> | 
 |                             <a class="dropdown-item" href="api/python/index.html">Python</a> | 
 |                             <a class="dropdown-item" href="api/R/index.html">R</a> | 
 |                             <a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a> | 
 |                         </div> | 
 |                     </li> | 
 |  | 
 |                     <li class="nav-item dropdown"> | 
 |                         <a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a> | 
 |                         <div class="dropdown-menu" aria-labelledby="navbarDeploying"> | 
 |                             <a class="dropdown-item" href="cluster-overview.html">Overview</a> | 
 |                             <a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a> | 
 |                             <div class="dropdown-divider"></div> | 
 |                             <a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a> | 
 |                             <a class="dropdown-item" href="running-on-mesos.html">Mesos</a> | 
 |                             <a class="dropdown-item" href="running-on-yarn.html">YARN</a> | 
 |                             <a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a> | 
 |                         </div> | 
 |                     </li> | 
 |  | 
 |                     <li class="nav-item dropdown"> | 
 |                         <a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a> | 
 |                         <div class="dropdown-menu" aria-labelledby="navbarMore"> | 
 |                             <a class="dropdown-item" href="configuration.html">Configuration</a> | 
 |                             <a class="dropdown-item" href="monitoring.html">Monitoring</a> | 
 |                             <a class="dropdown-item" href="tuning.html">Tuning Guide</a> | 
 |                             <a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a> | 
 |                             <a class="dropdown-item" href="security.html">Security</a> | 
 |                             <a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a> | 
 |                             <a class="dropdown-item" href="migration-guide.html">Migration Guide</a> | 
 |                             <div class="dropdown-divider"></div> | 
 |                             <a class="dropdown-item" href="building-spark.html">Building Spark</a> | 
 |                             <a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a> | 
 |                             <a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a> | 
 |                         </div> | 
 |                     </li> | 
 |  | 
 |                     <li class="nav-item"> | 
 |                         <input type="text" id="docsearch-input" placeholder="Search the docs…"> | 
 |                     </li> | 
 |                 </ul> | 
 |                 <!--<span class="navbar-text navbar-right"><span class="version-text">v3.5.0</span></span>--> | 
 |             </div> | 
 |         </nav> | 
 |  | 
 |          | 
 |  | 
 |         <div class="container"> | 
 |  | 
 |              | 
 |                  | 
 |                     <div class="left-menu-wrapper"> | 
 |     <div class="left-menu"> | 
 |         <h3><a href="ml-guide.html">MLlib: Main Guide</a></h3> | 
 |          | 
 | <ul> | 
 |  | 
 |     <li> | 
 |         <a href="ml-statistics.html"> | 
 |              | 
 |                 Basic statistics | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="ml-datasource.html"> | 
 |              | 
 |                 Data sources | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="ml-pipeline.html"> | 
 |              | 
 |                 Pipelines | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="ml-features.html"> | 
 |              | 
 |                 Extracting, transforming and selecting features | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="ml-classification-regression.html"> | 
 |              | 
 |                 Classification and Regression | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="ml-clustering.html"> | 
 |              | 
 |                 Clustering | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="ml-collaborative-filtering.html"> | 
 |              | 
 |                 Collaborative filtering | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="ml-frequent-pattern-mining.html"> | 
 |              | 
 |                 Frequent Pattern Mining | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="ml-tuning.html"> | 
 |              | 
 |                 Model selection and tuning | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="ml-advanced.html"> | 
 |              | 
 |                 Advanced topics | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 | </ul> | 
 |  | 
 |         <h3><a href="mllib-guide.html">MLlib: RDD-based API Guide</a></h3> | 
 |          | 
 | <ul> | 
 |  | 
 |     <li> | 
 |         <a href="mllib-data-types.html"> | 
 |              | 
 |                 Data types | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="mllib-statistics.html"> | 
 |              | 
 |                 Basic statistics | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="mllib-classification-regression.html"> | 
 |              | 
 |                 Classification and regression | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="mllib-collaborative-filtering.html"> | 
 |              | 
 |                 Collaborative filtering | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="mllib-clustering.html"> | 
 |              | 
 |                 Clustering | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="mllib-dimensionality-reduction.html"> | 
 |              | 
 |                 Dimensionality reduction | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="mllib-feature-extraction.html"> | 
 |              | 
 |                 Feature extraction and transformation | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="mllib-frequent-pattern-mining.html"> | 
 |              | 
 |                 Frequent pattern mining | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="mllib-evaluation-metrics.html"> | 
 |              | 
 |                 Evaluation metrics | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="mllib-pmml-model-export.html"> | 
 |              | 
 |                 PMML model export | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="mllib-optimization.html"> | 
 |              | 
 |                 Optimization (developer) | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 | </ul> | 
 |  | 
 |     </div> | 
 | </div> | 
 |                  | 
 |                 <input id="nav-trigger" class="nav-trigger" checked type="checkbox"> | 
 |                 <label for="nav-trigger"></label> | 
 |                 <div class="content-with-sidebar mr-3" id="content"> | 
 |                      | 
 |                         <h1 class="title">Linear Methods - RDD-based API</h1> | 
 |                      | 
 |  | 
 |                     <ul id="markdown-toc"> | 
 |   <li><a href="#mathematical-formulation" id="markdown-toc-mathematical-formulation">Mathematical formulation</a>    <ul> | 
 |       <li><a href="#loss-functions" id="markdown-toc-loss-functions">Loss functions</a></li> | 
 |       <li><a href="#regularizers" id="markdown-toc-regularizers">Regularizers</a></li> | 
 |       <li><a href="#optimization" id="markdown-toc-optimization">Optimization</a></li> | 
 |     </ul> | 
 |   </li> | 
 |   <li><a href="#classification" id="markdown-toc-classification">Classification</a>    <ul> | 
 |       <li><a href="#linear-support-vector-machines-svms" id="markdown-toc-linear-support-vector-machines-svms">Linear Support Vector Machines (SVMs)</a></li> | 
 |       <li><a href="#logistic-regression" id="markdown-toc-logistic-regression">Logistic regression</a></li> | 
 |     </ul> | 
 |   </li> | 
 |   <li><a href="#regression" id="markdown-toc-regression">Regression</a>    <ul> | 
 |       <li><a href="#linear-least-squares-lasso-and-ridge-regression" id="markdown-toc-linear-least-squares-lasso-and-ridge-regression">Linear least squares, Lasso, and ridge regression</a></li> | 
 |       <li><a href="#streaming-linear-regression" id="markdown-toc-streaming-linear-regression">Streaming linear regression</a></li> | 
 |     </ul> | 
 |   </li> | 
 |   <li><a href="#implementation-developer" id="markdown-toc-implementation-developer">Implementation (developer)</a></li> | 
 | </ul> | 
 |  | 
 | <p><code class="language-plaintext highlighter-rouge">\[ | 
 | \newcommand{\R}{\mathbb{R}} | 
 | \newcommand{\E}{\mathbb{E}} | 
 | \newcommand{\x}{\mathbf{x}} | 
 | \newcommand{\y}{\mathbf{y}} | 
 | \newcommand{\wv}{\mathbf{w}} | 
 | \newcommand{\av}{\mathbf{\alpha}} | 
 | \newcommand{\bv}{\mathbf{b}} | 
 | \newcommand{\N}{\mathbb{N}} | 
 | \newcommand{\id}{\mathbf{I}} | 
 | \newcommand{\ind}{\mathbf{1}} | 
 | \newcommand{\0}{\mathbf{0}} | 
 | \newcommand{\unit}{\mathbf{e}} | 
 | \newcommand{\one}{\mathbf{1}} | 
 | \newcommand{\zero}{\mathbf{0}} | 
 | \]</code></p> | 
 |  | 
 | <h2 id="mathematical-formulation">Mathematical formulation</h2> | 
 |  | 
 | <p>Many standard <em>machine learning</em> methods can be formulated as a convex optimization problem, i.e. | 
 | the task of finding a minimizer of a convex function <code class="language-plaintext highlighter-rouge">$f$</code> that depends on a variable vector | 
 | <code class="language-plaintext highlighter-rouge">$\wv$</code> (called <code class="language-plaintext highlighter-rouge">weights</code> in the code), which has <code class="language-plaintext highlighter-rouge">$d$</code> entries. | 
 | Formally, we can write this as the optimization problem <code class="language-plaintext highlighter-rouge">$\min_{\wv \in\R^d} \; f(\wv)$</code>, where | 
 | the objective function is of the form | 
 | <code class="language-plaintext highlighter-rouge">\begin{equation} | 
 |     f(\wv) := \lambda\, R(\wv) + | 
 |     \frac1n \sum_{i=1}^n L(\wv;\x_i,y_i) | 
 |     \label{eq:regPrimal} | 
 |     \ . | 
 | \end{equation}</code> | 
 | Here the vectors <code class="language-plaintext highlighter-rouge">$\x_i\in\R^d$</code> are the training data examples, for <code class="language-plaintext highlighter-rouge">$1\le i\le n$</code>, and | 
 | <code class="language-plaintext highlighter-rouge">$y_i\in\R$</code> are their corresponding labels, which we want to predict. | 
We call the method <em>linear</em> if $L(\wv; \x, y)$ can be expressed as a function of $\wv^T \x$ and $y$.
 | Several of <code class="language-plaintext highlighter-rouge">spark.mllib</code>’s classification and regression algorithms fall into this category, | 
 | and are discussed here.</p> | 
 |  | 
 | <p>The objective function <code class="language-plaintext highlighter-rouge">$f$</code> has two parts: | 
 | the regularizer that controls the complexity of the model, | 
 | and the loss that measures the error of the model on the training data. | 
 | The loss function <code class="language-plaintext highlighter-rouge">$L(\wv;.)$</code> is typically a convex function in <code class="language-plaintext highlighter-rouge">$\wv$</code>.  The | 
 | fixed regularization parameter <code class="language-plaintext highlighter-rouge">$\lambda \ge 0$</code> (<code class="language-plaintext highlighter-rouge">regParam</code> in the code) | 
defines the trade-off between the two goals of minimizing the loss (i.e.,
training error) and minimizing model complexity (i.e., avoiding overfitting).</p>
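<p>As a concrete illustration of this decomposition, the following sketch assembles the objective
from an arbitrary loss and regularizer. It is plain Scala written for this discussion, not part of
the <code class="language-plaintext highlighter-rouge">spark.mllib</code> API; the dense-array vector type and the function signatures are illustrative assumptions.</p>

<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Sketch: f(w) = lambda * R(w) + (1/n) * sum_i L(w; x_i, y_i)
// Vectors are plain Array[Double]; the loss L and regularizer R are passed in as functions.
object ObjectiveSketch {
  type Vec = Array[Double]

  def dot(a: Vec, b: Vec): Double =
    a.zip(b).map { case (ai, bi) => ai * bi }.sum

  def objective(
      data: Seq[(Vec, Double)],            // training examples (x_i, y_i)
      w: Vec,                              // weight vector
      lambda: Double,                      // regularization parameter (regParam)
      loss: (Vec, Vec, Double) => Double,  // L(w; x, y)
      reg: Vec => Double                   // R(w)
  ): Double = {
    val avgLoss = data.map { case (x, y) => loss(w, x, y) }.sum / data.size
    lambda * reg(w) + avgLoss
  }
}</code></pre></figure>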
 |  | 
 | <h3 id="loss-functions">Loss functions</h3> | 
 |  | 
 | <p>The following table summarizes the loss functions and their gradients or sub-gradients for the | 
 | methods <code class="language-plaintext highlighter-rouge">spark.mllib</code> supports:</p> | 
 |  | 
 | <table class="table table-striped"> | 
 |   <thead> | 
 |     <tr><th></th><th>loss function $L(\wv; \x, y)$</th><th>gradient or sub-gradient</th></tr> | 
 |   </thead> | 
 |   <tbody> | 
 |     <tr> | 
 |       <td>hinge loss</td><td>$\max \{0, 1-y \wv^T \x \}, \quad y \in \{-1, +1\}$</td> | 
 |       <td>$\begin{cases}-y \cdot \x & \text{if $y \wv^T \x <1$}, \\ 0 & | 
 | \text{otherwise}.\end{cases}$</td> | 
 |     </tr> | 
 |     <tr> | 
 |       <td>logistic loss</td><td>$\log(1+\exp( -y \wv^T \x)), \quad y \in \{-1, +1\}$</td> | 
 |       <td>$-y \left(1-\frac1{1+\exp(-y \wv^T \x)} \right) \cdot \x$</td> | 
 |     </tr> | 
 |     <tr> | 
 |       <td>squared loss</td><td>$\frac{1}{2} (\wv^T \x - y)^2, \quad y \in \R$</td> | 
 |       <td>$(\wv^T \x - y) \cdot \x$</td> | 
 |     </tr> | 
 |   </tbody> | 
 | </table> | 
 |  | 
 | <p>Note that, in the mathematical formulation above, a binary label $y$ is denoted as either | 
 | $+1$ (positive) or $-1$ (negative), which is convenient for the formulation. | 
 | <em>However</em>, the negative label is represented by $0$ in <code class="language-plaintext highlighter-rouge">spark.mllib</code> instead of $-1$, to be consistent with | 
 | multiclass labeling.</p> | 
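<p>To make the table concrete, the sketch below evaluates each loss and its gradient or sub-gradient,
using the $\pm1$ label convention of the table. It is plain illustrative Scala; inside
<code class="language-plaintext highlighter-rouge">spark.mllib</code> the corresponding gradients are implemented by
<code class="language-plaintext highlighter-rouge">HingeGradient</code>, <code class="language-plaintext highlighter-rouge">LogisticGradient</code> and
<code class="language-plaintext highlighter-rouge">LeastSquaresGradient</code> in <code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.optimization</code>.</p>

<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Losses and (sub-)gradients from the table above, with labels y in {-1, +1}.
object LossSketch {
  type Vec = Array[Double]
  def dot(a: Vec, b: Vec): Double = a.zip(b).map { case (ai, bi) => ai * bi }.sum
  def scale(v: Vec, c: Double): Vec = v.map(_ * c)

  def hingeLoss(w: Vec, x: Vec, y: Double): Double = math.max(0.0, 1.0 - y * dot(w, x))
  def hingeSubGradient(w: Vec, x: Vec, y: Double): Vec =
    if (y * dot(w, x) < 1.0) scale(x, -y) else Array.fill(w.length)(0.0)

  def logisticLoss(w: Vec, x: Vec, y: Double): Double = math.log1p(math.exp(-y * dot(w, x)))
  def logisticGradient(w: Vec, x: Vec, y: Double): Vec =
    scale(x, -y * (1.0 - 1.0 / (1.0 + math.exp(-y * dot(w, x)))))

  def squaredLoss(w: Vec, x: Vec, y: Double): Double = 0.5 * math.pow(dot(w, x) - y, 2)
  def squaredGradient(w: Vec, x: Vec, y: Double): Vec = scale(x, dot(w, x) - y)
}</code></pre></figure>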
 |  | 
 | <h3 id="regularizers">Regularizers</h3> | 
 |  | 
 | <p>The purpose of the | 
 | <a href="http://en.wikipedia.org/wiki/Regularization_(mathematics)">regularizer</a> is to | 
 | encourage simple models and avoid overfitting.  We support the following | 
 | regularizers in <code class="language-plaintext highlighter-rouge">spark.mllib</code>:</p> | 
 |  | 
 | <table class="table table-striped"> | 
 |   <thead> | 
 |     <tr><th></th><th>regularizer $R(\wv)$</th><th>gradient or sub-gradient</th></tr> | 
 |   </thead> | 
 |   <tbody> | 
 |     <tr> | 
 |       <td>zero (unregularized)</td><td>0</td><td>$\0$</td> | 
 |     </tr> | 
 |     <tr> | 
 |       <td>L2</td><td>$\frac{1}{2}\|\wv\|_2^2$</td><td>$\wv$</td> | 
 |     </tr> | 
 |     <tr> | 
 |       <td>L1</td><td>$\|\wv\|_1$</td><td>$\mathrm{sign}(\wv)$</td> | 
 |     </tr> | 
 |     <tr> | 
 |       <td>elastic net</td><td>$\alpha \|\wv\|_1 + (1-\alpha)\frac{1}{2}\|\wv\|_2^2$</td><td>$\alpha \mathrm{sign}(\wv) + (1-\alpha) \wv$</td> | 
 |     </tr> | 
 |   </tbody> | 
 | </table> | 
 |  | 
 | <p>Here <code class="language-plaintext highlighter-rouge">$\mathrm{sign}(\wv)$</code> is the vector consisting of the signs (<code class="language-plaintext highlighter-rouge">$\pm1$</code>) of all the entries | 
 | of <code class="language-plaintext highlighter-rouge">$\wv$</code>.</p> | 
 |  | 
<p>L2-regularized problems are generally easier to solve than L1-regularized ones because the L2 penalty is smooth.
However, L1 regularization promotes sparsity in the weights, leading to smaller and more interpretable models, which can also be useful for feature selection.
<a href="http://en.wikipedia.org/wiki/Elastic_net_regularization">Elastic net</a> is a combination of L1 and L2 regularization. Training models without any regularization is not recommended,
especially when the number of training examples is small.</p>
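<p>The regularizers and their gradients or sub-gradients are equally direct to write down. The sketch
below is plain illustrative Scala, with <code class="language-plaintext highlighter-rouge">alpha</code> standing for the elastic-net mixing parameter; the
updaters actually used by <code class="language-plaintext highlighter-rouge">spark.mllib</code> are <code class="language-plaintext highlighter-rouge">SimpleUpdater</code>, <code class="language-plaintext highlighter-rouge">SquaredL2Updater</code> and
<code class="language-plaintext highlighter-rouge">L1Updater</code> in <code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.optimization</code>.</p>

<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// R(w) and its (sub-)gradient for the regularizers in the table above.
object RegularizerSketch {
  type Vec = Array[Double]

  def l2(w: Vec): Double = 0.5 * w.map(wi => wi * wi).sum
  def l2Gradient(w: Vec): Vec = w

  def l1(w: Vec): Double = w.map(wi => math.abs(wi)).sum
  def l1SubGradient(w: Vec): Vec = w.map(wi => math.signum(wi))

  def elasticNet(w: Vec, alpha: Double): Double = alpha * l1(w) + (1.0 - alpha) * l2(w)
  def elasticNetSubGradient(w: Vec, alpha: Double): Vec =
    l1SubGradient(w).zip(l2Gradient(w)).map { case (g1, g2) => alpha * g1 + (1.0 - alpha) * g2 }
}</code></pre></figure>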
 |  | 
 | <h3 id="optimization">Optimization</h3> | 
 |  | 
 | <p>Under the hood, linear methods use convex optimization methods to optimize the objective functions. | 
 | <code class="language-plaintext highlighter-rouge">spark.mllib</code> uses two methods, SGD and L-BFGS, described in the <a href="mllib-optimization.html">optimization section</a>. | 
 | Currently, most algorithm APIs support Stochastic Gradient Descent (SGD), and a few support L-BFGS. | 
 | Refer to <a href="mllib-optimization.html#Choosing-an-Optimization-Method">this optimization section</a> for guidelines on choosing between optimization methods.</p> | 
 |  | 
 | <h2 id="classification">Classification</h2> | 
 |  | 
 | <p><a href="http://en.wikipedia.org/wiki/Statistical_classification">Classification</a> aims to divide items into | 
 | categories. | 
 | The most common classification type is | 
 | <a href="http://en.wikipedia.org/wiki/Binary_classification">binary classification</a>, where there are two | 
 | categories, usually named positive and negative. | 
 | If there are more than two categories, it is called | 
 | <a href="http://en.wikipedia.org/wiki/Multiclass_classification">multiclass classification</a>. | 
 | <code class="language-plaintext highlighter-rouge">spark.mllib</code> supports two linear methods for classification: linear Support Vector Machines (SVMs) | 
 | and logistic regression. | 
Linear SVMs support only binary classification, while logistic regression supports both binary and
multiclass classification problems.
 | For both methods, <code class="language-plaintext highlighter-rouge">spark.mllib</code> supports L1 and L2 regularized variants. | 
 | The training data set is represented by an RDD of <a href="mllib-data-types.html#labeled-point">LabeledPoint</a> in MLlib, | 
 | where labels are class indices starting from zero: $0, 1, 2, \ldots$.</p> | 
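<p>For example, a toy binary training set can be built directly from <code class="language-plaintext highlighter-rouge">LabeledPoint</code>s with 0/1 labels.
This small sketch assumes an existing <code class="language-plaintext highlighter-rouge">SparkContext</code> named <code class="language-plaintext highlighter-rouge">sc</code>, as the examples below do.</p>

<figure class="highlight"><pre><code class="language-scala" data-lang="scala">import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// Labels are class indices: 0.0 (negative) and 1.0 (positive).
val toyTrainingData = sc.parallelize(Seq(
  LabeledPoint(1.0, Vectors.dense(1.0, 0.0, 3.0)),
  LabeledPoint(0.0, Vectors.dense(2.0, 1.0, 0.5)),
  LabeledPoint(1.0, Vectors.dense(0.0, 4.0, 1.0))))</code></pre></figure>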
 |  | 
 | <h3 id="linear-support-vector-machines-svms">Linear Support Vector Machines (SVMs)</h3> | 
 |  | 
 | <p>The <a href="http://en.wikipedia.org/wiki/Support_vector_machine#Linear_SVM">linear SVM</a> | 
 | is a standard method for large-scale classification tasks. It is a linear method as described above in equation <code class="language-plaintext highlighter-rouge">$\eqref{eq:regPrimal}$</code>, with the loss function in the formulation given by the hinge loss:</p> | 
 |  | 
 | <p><code class="language-plaintext highlighter-rouge">\[ | 
 | L(\wv;\x,y) := \max \{0, 1-y \wv^T \x \}. | 
 | \]</code> | 
By default, linear SVMs are trained with L2 regularization.
We also support an alternative L1 regularization. In this case,
the problem becomes a <a href="http://en.wikipedia.org/wiki/Linear_programming">linear program</a>.</p>
 |  | 
<p>The linear SVM algorithm outputs an SVM model. Given a new data point,
denoted by $\x$, the model makes predictions based on the value of $\wv^T \x$.
By default, if $\wv^T \x \geq 0$ then the outcome is positive, and negative
otherwise.</p>
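<p>This default decision rule is just a thresholded inner product. A minimal sketch in plain Scala
(illustrative only; the actual prediction is made by <code class="language-plaintext highlighter-rouge">SVMModel.predict</code>, whose threshold can be
changed or cleared, as the examples below show):</p>

<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Sketch of the default SVM decision rule: predict 1.0 if w^T x >= 0, else 0.0.
def svmPredict(w: Array[Double], x: Array[Double]): Double = {
  val margin = w.zip(x).map { case (wi, xi) => wi * xi }.sum
  if (margin >= 0.0) 1.0 else 0.0
}</code></pre></figure>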
 |  | 
 | <p><strong>Examples</strong></p> | 
 |  | 
 | <div class="codetabs"> | 
 |  | 
 | <div data-lang="python"> | 
    <p>The following example shows how to load a sample dataset, build an SVM model,
and make predictions with the resulting model to compute the training error.</p>
 |  | 
 |     <p>Refer to the <a href="api/python/reference/api/pyspark.mllib.classification.SVMWithSGD.html"><code class="language-plaintext highlighter-rouge">SVMWithSGD</code> Python docs</a> and <a href="api/python/reference/api/pyspark.mllib.classification.SVMModel.html"><code class="language-plaintext highlighter-rouge">SVMModel</code> Python docs</a> for more details on the API.</p> | 
 |  | 
 |     <div class="highlight"><pre class="codehilite"><code><span class="kn">from</span> <span class="nn">pyspark.mllib.classification</span> <span class="kn">import</span> <span class="n">SVMWithSGD</span><span class="p">,</span> <span class="n">SVMModel</span> | 
 | <span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="kn">import</span> <span class="n">LabeledPoint</span> | 
 |  | 
 | <span class="c1"># Load and parse the data | 
 | </span><span class="k">def</span> <span class="nf">parsePoint</span><span class="p">(</span><span class="n">line</span><span class="p">):</span> | 
 |     <span class="n">values</span> <span class="o">=</span> <span class="p">[</span><span class="nb">float</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">line</span><span class="p">.</span><span class="n">split</span><span class="p">(</span><span class="s">' '</span><span class="p">)]</span> | 
 |     <span class="k">return</span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="n">values</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">values</span><span class="p">[</span><span class="mi">1</span><span class="p">:])</span> | 
 |  | 
 | <span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="p">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">"data/mllib/sample_svm_data.txt"</span><span class="p">)</span> | 
 | <span class="n">parsedData</span> <span class="o">=</span> <span class="n">data</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="n">parsePoint</span><span class="p">)</span> | 
 |  | 
 | <span class="c1"># Build the model | 
 | </span><span class="n">model</span> <span class="o">=</span> <span class="n">SVMWithSGD</span><span class="p">.</span><span class="n">train</span><span class="p">(</span><span class="n">parsedData</span><span class="p">,</span> <span class="n">iterations</span><span class="o">=</span><span class="mi">100</span><span class="p">)</span> | 
 |  | 
 | <span class="c1"># Evaluating the model on training data | 
 | </span><span class="n">labelsAndPreds</span> <span class="o">=</span> <span class="n">parsedData</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">.</span><span class="n">label</span><span class="p">,</span> <span class="n">model</span><span class="p">.</span><span class="n">predict</span><span class="p">(</span><span class="n">p</span><span class="p">.</span><span class="n">features</span><span class="p">)))</span> | 
 | <span class="n">trainErr</span> <span class="o">=</span> <span class="n">labelsAndPreds</span><span class="p">.</span><span class="nb">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">lp</span><span class="p">:</span> <span class="n">lp</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">!=</span> <span class="n">lp</span><span class="p">[</span><span class="mi">1</span><span class="p">]).</span><span class="n">count</span><span class="p">()</span> <span class="o">/</span> <span class="nb">float</span><span class="p">(</span><span class="n">parsedData</span><span class="p">.</span><span class="n">count</span><span class="p">())</span> | 
 | <span class="k">print</span><span class="p">(</span><span class="s">"Training Error = "</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">trainErr</span><span class="p">))</span> | 
 |  | 
 | <span class="c1"># Save and load model | 
 | </span><span class="n">model</span><span class="p">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="s">"target/tmp/pythonSVMWithSGDModel"</span><span class="p">)</span> | 
 | <span class="n">sameModel</span> <span class="o">=</span> <span class="n">SVMModel</span><span class="p">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="s">"target/tmp/pythonSVMWithSGDModel"</span><span class="p">)</span></code></pre></div> | 
 |     <div><small>Find full example code at "examples/src/main/python/mllib/svm_with_sgd_example.py" in the Spark repo.</small></div> | 
 |   </div> | 
 |  | 
 | <div data-lang="scala"> | 
    <p>The following code snippet illustrates how to load a sample dataset, train an SVM
using a static method on the algorithm object, and use the resulting model to
compute the area under the ROC curve on a held-out test set.</p>
 |  | 
 |     <p>Refer to the <a href="api/scala/org/apache/spark/mllib/classification/SVMWithSGD.html"><code class="language-plaintext highlighter-rouge">SVMWithSGD</code> Scala docs</a> and <a href="api/scala/org/apache/spark/mllib/classification/SVMModel.html"><code class="language-plaintext highlighter-rouge">SVMModel</code> Scala docs</a> for details on the API.</p> | 
 |  | 
 |     <div class="highlight"><pre class="codehilite"><code><span class="k">import</span> <span class="nn">org.apache.spark.mllib.classification.</span><span class="o">{</span><span class="nc">SVMModel</span><span class="o">,</span> <span class="nc">SVMWithSGD</span><span class="o">}</span> | 
 | <span class="k">import</span> <span class="nn">org.apache.spark.mllib.evaluation.BinaryClassificationMetrics</span> | 
 | <span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span> | 
 |  | 
 | <span class="c1">// Load training data in LIBSVM format.</span> | 
 | <span class="k">val</span> <span class="nv">data</span> <span class="k">=</span> <span class="nv">MLUtils</span><span class="o">.</span><span class="py">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">)</span> | 
 |  | 
 | <span class="c1">// Split data into training (60%) and test (40%).</span> | 
 | <span class="k">val</span> <span class="nv">splits</span> <span class="k">=</span> <span class="nv">data</span><span class="o">.</span><span class="py">randomSplit</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="mf">0.6</span><span class="o">,</span> <span class="mf">0.4</span><span class="o">),</span> <span class="n">seed</span> <span class="k">=</span> <span class="mi">11L</span><span class="o">)</span> | 
 | <span class="k">val</span> <span class="nv">training</span> <span class="k">=</span> <span class="nf">splits</span><span class="o">(</span><span class="mi">0</span><span class="o">).</span><span class="py">cache</span><span class="o">()</span> | 
 | <span class="k">val</span> <span class="nv">test</span> <span class="k">=</span> <span class="nf">splits</span><span class="o">(</span><span class="mi">1</span><span class="o">)</span> | 
 |  | 
 | <span class="c1">// Run training algorithm to build the model</span> | 
 | <span class="k">val</span> <span class="nv">numIterations</span> <span class="k">=</span> <span class="mi">100</span> | 
 | <span class="k">val</span> <span class="nv">model</span> <span class="k">=</span> <span class="nv">SVMWithSGD</span><span class="o">.</span><span class="py">train</span><span class="o">(</span><span class="n">training</span><span class="o">,</span> <span class="n">numIterations</span><span class="o">)</span> | 
 |  | 
 | <span class="c1">// Clear the default threshold.</span> | 
 | <span class="nv">model</span><span class="o">.</span><span class="py">clearThreshold</span><span class="o">()</span> | 
 |  | 
 | <span class="c1">// Compute raw scores on the test set.</span> | 
 | <span class="k">val</span> <span class="nv">scoreAndLabels</span> <span class="k">=</span> <span class="nv">test</span><span class="o">.</span><span class="py">map</span> <span class="o">{</span> <span class="n">point</span> <span class="k">=></span> | 
 |   <span class="k">val</span> <span class="nv">score</span> <span class="k">=</span> <span class="nv">model</span><span class="o">.</span><span class="py">predict</span><span class="o">(</span><span class="nv">point</span><span class="o">.</span><span class="py">features</span><span class="o">)</span> | 
 |   <span class="o">(</span><span class="n">score</span><span class="o">,</span> <span class="nv">point</span><span class="o">.</span><span class="py">label</span><span class="o">)</span> | 
 | <span class="o">}</span> | 
 |  | 
 | <span class="c1">// Get evaluation metrics.</span> | 
 | <span class="k">val</span> <span class="nv">metrics</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">BinaryClassificationMetrics</span><span class="o">(</span><span class="n">scoreAndLabels</span><span class="o">)</span> | 
 | <span class="k">val</span> <span class="nv">auROC</span> <span class="k">=</span> <span class="nv">metrics</span><span class="o">.</span><span class="py">areaUnderROC</span><span class="o">()</span> | 
 |  | 
 | <span class="nf">println</span><span class="o">(</span><span class="n">s</span><span class="s">"Area under ROC = $auROC"</span><span class="o">)</span> | 
 |  | 
 | <span class="c1">// Save and load model</span> | 
 | <span class="nv">model</span><span class="o">.</span><span class="py">save</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"target/tmp/scalaSVMWithSGDModel"</span><span class="o">)</span> | 
 | <span class="k">val</span> <span class="nv">sameModel</span> <span class="k">=</span> <span class="nv">SVMModel</span><span class="o">.</span><span class="py">load</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"target/tmp/scalaSVMWithSGDModel"</span><span class="o">)</span></code></pre></div> | 
 |     <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/SVMWithSGDExample.scala" in the Spark repo.</small></div> | 
 |  | 
 |     <p>The <code class="language-plaintext highlighter-rouge">SVMWithSGD.train()</code> method by default performs L2 regularization with the | 
 | regularization parameter set to 1.0. If we want to configure this algorithm, we | 
 | can customize <code class="language-plaintext highlighter-rouge">SVMWithSGD</code> further by creating a new object directly and | 
 | calling setter methods. All other <code class="language-plaintext highlighter-rouge">spark.mllib</code> algorithms support customization in | 
 | this way as well. For example, the following code produces an L1 regularized | 
 | variant of SVMs with regularization parameter set to 0.1, and runs the training | 
 | algorithm for 200 iterations.</p> | 
 |  | 
 |     <figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.mllib.optimization.L1Updater</span> | 
 |  | 
 | <span class="k">val</span> <span class="nv">svmAlg</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">SVMWithSGD</span><span class="o">()</span> | 
 | <span class="nv">svmAlg</span><span class="o">.</span><span class="py">optimizer</span> | 
 |   <span class="o">.</span><span class="py">setNumIterations</span><span class="o">(</span><span class="mi">200</span><span class="o">)</span> | 
 |   <span class="o">.</span><span class="py">setRegParam</span><span class="o">(</span><span class="mf">0.1</span><span class="o">)</span> | 
 |   <span class="o">.</span><span class="py">setUpdater</span><span class="o">(</span><span class="k">new</span> <span class="n">L1Updater</span><span class="o">)</span> | 
 | <span class="k">val</span> <span class="nv">modelL1</span> <span class="k">=</span> <span class="nv">svmAlg</span><span class="o">.</span><span class="py">run</span><span class="o">(</span><span class="n">training</span><span class="o">)</span></code></pre></figure> | 
 |  | 
 |   </div> | 
 |  | 
 | <div data-lang="java"> | 
    <p>All of MLlib’s methods use Java-friendly types, so you can import and call them from Java the same
way you do in Scala. The only caveat is that the methods take Scala RDD objects, while the
Spark Java API uses a separate <code class="language-plaintext highlighter-rouge">JavaRDD</code> class. You can convert a Java RDD to a Scala one by
calling <code class="language-plaintext highlighter-rouge">.rdd()</code> on your <code class="language-plaintext highlighter-rouge">JavaRDD</code> object. A self-contained application example
that is equivalent to the Scala example above is given below:</p>
 |  | 
 |     <p>Refer to the <a href="api/java/org/apache/spark/mllib/classification/SVMWithSGD.html"><code class="language-plaintext highlighter-rouge">SVMWithSGD</code> Java docs</a> and <a href="api/java/org/apache/spark/mllib/classification/SVMModel.html"><code class="language-plaintext highlighter-rouge">SVMModel</code> Java docs</a> for details on the API.</p> | 
 |  | 
 |     <div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">scala.Tuple2</span><span class="o">;</span> | 
 |  | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.classification.SVMModel</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.classification.SVMWithSGD</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.evaluation.BinaryClassificationMetrics</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span><span class="o">;</span> | 
 |  | 
 | <span class="nc">String</span> <span class="n">path</span> <span class="o">=</span> <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">;</span> | 
 | <span class="nc">JavaRDD</span><span class="o"><</span><span class="nc">LabeledPoint</span><span class="o">></span> <span class="n">data</span> <span class="o">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="na">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="n">path</span><span class="o">).</span><span class="na">toJavaRDD</span><span class="o">();</span> | 
 |  | 
 | <span class="c1">// Split initial RDD into two... [60% training data, 40% testing data].</span> | 
 | <span class="nc">JavaRDD</span><span class="o"><</span><span class="nc">LabeledPoint</span><span class="o">></span> <span class="n">training</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="na">sample</span><span class="o">(</span><span class="kc">false</span><span class="o">,</span> <span class="mf">0.6</span><span class="o">,</span> <span class="mi">11L</span><span class="o">);</span> | 
 | <span class="n">training</span><span class="o">.</span><span class="na">cache</span><span class="o">();</span> | 
 | <span class="nc">JavaRDD</span><span class="o"><</span><span class="nc">LabeledPoint</span><span class="o">></span> <span class="n">test</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="na">subtract</span><span class="o">(</span><span class="n">training</span><span class="o">);</span> | 
 |  | 
 | <span class="c1">// Run training algorithm to build the model.</span> | 
 | <span class="kt">int</span> <span class="n">numIterations</span> <span class="o">=</span> <span class="mi">100</span><span class="o">;</span> | 
 | <span class="nc">SVMModel</span> <span class="n">model</span> <span class="o">=</span> <span class="nc">SVMWithSGD</span><span class="o">.</span><span class="na">train</span><span class="o">(</span><span class="n">training</span><span class="o">.</span><span class="na">rdd</span><span class="o">(),</span> <span class="n">numIterations</span><span class="o">);</span> | 
 |  | 
 | <span class="c1">// Clear the default threshold.</span> | 
 | <span class="n">model</span><span class="o">.</span><span class="na">clearThreshold</span><span class="o">();</span> | 
 |  | 
 | <span class="c1">// Compute raw scores on the test set.</span> | 
 | <span class="nc">JavaRDD</span><span class="o"><</span><span class="nc">Tuple2</span><span class="o"><</span><span class="nc">Object</span><span class="o">,</span> <span class="nc">Object</span><span class="o">>></span> <span class="n">scoreAndLabels</span> <span class="o">=</span> <span class="n">test</span><span class="o">.</span><span class="na">map</span><span class="o">(</span><span class="n">p</span> <span class="o">-></span> | 
 |   <span class="k">new</span> <span class="nc">Tuple2</span><span class="o"><>(</span><span class="n">model</span><span class="o">.</span><span class="na">predict</span><span class="o">(</span><span class="n">p</span><span class="o">.</span><span class="na">features</span><span class="o">()),</span> <span class="n">p</span><span class="o">.</span><span class="na">label</span><span class="o">()));</span> | 
 |  | 
 | <span class="c1">// Get evaluation metrics.</span> | 
 | <span class="nc">BinaryClassificationMetrics</span> <span class="n">metrics</span> <span class="o">=</span> | 
 |   <span class="k">new</span> <span class="nf">BinaryClassificationMetrics</span><span class="o">(</span><span class="nc">JavaRDD</span><span class="o">.</span><span class="na">toRDD</span><span class="o">(</span><span class="n">scoreAndLabels</span><span class="o">));</span> | 
 | <span class="kt">double</span> <span class="n">auROC</span> <span class="o">=</span> <span class="n">metrics</span><span class="o">.</span><span class="na">areaUnderROC</span><span class="o">();</span> | 
 |  | 
 | <span class="nc">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">"Area under ROC = "</span> <span class="o">+</span> <span class="n">auROC</span><span class="o">);</span> | 
 |  | 
 | <span class="c1">// Save and load model</span> | 
 | <span class="n">model</span><span class="o">.</span><span class="na">save</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"target/tmp/javaSVMWithSGDModel"</span><span class="o">);</span> | 
 | <span class="nc">SVMModel</span> <span class="n">sameModel</span> <span class="o">=</span> <span class="nc">SVMModel</span><span class="o">.</span><span class="na">load</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"target/tmp/javaSVMWithSGDModel"</span><span class="o">);</span></code></pre></div> | 
 |     <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/mllib/JavaSVMWithSGDExample.java" in the Spark repo.</small></div> | 
 |  | 
 |     <p>The <code class="language-plaintext highlighter-rouge">SVMWithSGD.train()</code> method by default performs L2 regularization with the | 
 | regularization parameter set to 1.0. If we want to configure this algorithm, we | 
 | can customize <code class="language-plaintext highlighter-rouge">SVMWithSGD</code> further by creating a new object directly and | 
 | calling setter methods. All other <code class="language-plaintext highlighter-rouge">spark.mllib</code> algorithms support customization in | 
 | this way as well. For example, the following code produces an L1 regularized | 
 | variant of SVMs with regularization parameter set to 0.1, and runs the training | 
 | algorithm for 200 iterations.</p> | 
 |  | 
 |     <figure class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">org.apache.spark.mllib.optimization.L1Updater</span><span class="o">;</span> | 
 |  | 
 | <span class="nc">SVMWithSGD</span> <span class="n">svmAlg</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">SVMWithSGD</span><span class="o">();</span> | 
 | <span class="n">svmAlg</span><span class="o">.</span><span class="na">optimizer</span><span class="o">()</span> | 
 |   <span class="o">.</span><span class="na">setNumIterations</span><span class="o">(</span><span class="mi">200</span><span class="o">)</span> | 
 |   <span class="o">.</span><span class="na">setRegParam</span><span class="o">(</span><span class="mf">0.1</span><span class="o">)</span> | 
 |   <span class="o">.</span><span class="na">setUpdater</span><span class="o">(</span><span class="k">new</span> <span class="nc">L1Updater</span><span class="o">());</span> | 
 | <span class="nc">SVMModel</span> <span class="n">modelL1</span> <span class="o">=</span> <span class="n">svmAlg</span><span class="o">.</span><span class="na">run</span><span class="o">(</span><span class="n">training</span><span class="o">.</span><span class="na">rdd</span><span class="o">());</span></code></pre></figure> | 
 |  | 
    <p>In order to run the above application, follow the instructions
provided in the <a href="quick-start.html#self-contained-applications">Self-Contained
Applications</a> section of the Spark
quick-start guide. Be sure to also include <em>spark-mllib</em> in your build file as
a dependency.</p>
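    <p>For example, with sbt the dependencies can be declared as follows; this is a sketch for this
documentation version, and the exact coordinates should match your Spark deployment.</p>

    <figure class="highlight"><pre><code class="language-scala" data-lang="scala">// build.sbt (sketch): add Spark Core and MLlib as provided dependencies.
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"  % "3.5.0" % "provided",
  "org.apache.spark" %% "spark-mllib" % "3.5.0" % "provided")</code></pre></figure>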
 |   </div> | 
 |  | 
 | </div> | 
 |  | 
 | <h3 id="logistic-regression">Logistic regression</h3> | 
 |  | 
 | <p><a href="http://en.wikipedia.org/wiki/Logistic_regression">Logistic regression</a> is widely used to predict a | 
 | binary response. It is a linear method as described above in equation <code class="language-plaintext highlighter-rouge">$\eqref{eq:regPrimal}$</code>, | 
 | with the loss function in the formulation given by the logistic loss: | 
 | <code class="language-plaintext highlighter-rouge">\[ | 
 | L(\wv;\x,y) :=  \log(1+\exp( -y \wv^T \x)). | 
 | \]</code></p> | 
 |  | 
 | <p>For binary classification problems, the algorithm outputs a binary logistic regression model. | 
 | Given a new data point, denoted by $\x$, the model makes predictions by | 
 | applying the logistic function | 
 | <code class="language-plaintext highlighter-rouge">\[ | 
 | \mathrm{f}(z) = \frac{1}{1 + e^{-z}} | 
 | \]</code> | 
 | where $z = \wv^T \x$. | 
By default, if $\mathrm{f}(\wv^T \x) > 0.5$, the outcome is positive, and
negative otherwise, though unlike linear SVMs, the raw output of the logistic regression
model, $\mathrm{f}(z)$, has a probabilistic interpretation (i.e., the probability
that $\x$ is positive).</p>
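<p>The prediction rule can likewise be sketched directly in plain Scala (illustrative only;
<code class="language-plaintext highlighter-rouge">LogisticRegressionModel</code> applies an equivalent rule, and its default 0.5 threshold can be changed
or cleared on the model):</p>

<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Sketch: binary logistic regression prediction via the logistic function f(z) = 1 / (1 + e^(-z)).
def sigmoid(z: Double): Double = 1.0 / (1.0 + math.exp(-z))

def logisticPredict(w: Array[Double], x: Array[Double], threshold: Double = 0.5): Double = {
  val margin = w.zip(x).map { case (wi, xi) => wi * xi }.sum
  if (sigmoid(margin) > threshold) 1.0 else 0.0  // sigmoid(margin) is the probability that x is positive
}</code></pre></figure>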
 |  | 
 | <p>Binary logistic regression can be generalized into | 
 | <a href="http://en.wikipedia.org/wiki/Multinomial_logistic_regression">multinomial logistic regression</a> to | 
 | train and predict multiclass classification problems. | 
 | For example, for $K$ possible outcomes, one of the outcomes can be chosen as a “pivot”, and the | 
 | other $K - 1$ outcomes can be separately regressed against the pivot outcome. | 
 | In <code class="language-plaintext highlighter-rouge">spark.mllib</code>, the first class $0$ is chosen as the “pivot” class. | 
 | See Section 4.4 of | 
 | <a href="http://statweb.stanford.edu/~tibs/ElemStatLearn/">The Elements of Statistical Learning</a> for | 
 | references. | 
 | Here is a | 
 | <a href="http://www.slideshare.net/dbtsai/2014-0620-mlor-36132297">detailed mathematical derivation</a>.</p> | 
 |  | 
 | <p>For multiclass classification problems, the algorithm will output a multinomial logistic regression | 
 | model, which contains $K - 1$ binary logistic regression models regressed against the first class. | 
Given a new data point, $K - 1$ models will be run, and the class with the largest probability will be
chosen as the predicted class.</p>
 |  | 
 | <p>We implemented two algorithms to solve logistic regression: mini-batch gradient descent and L-BFGS. | 
 | We recommend L-BFGS over mini-batch gradient descent for faster convergence.</p> | 
 |  | 
 | <p><strong>Examples</strong></p> | 
 |  | 
 | <div class="codetabs"> | 
 |  | 
 | <div data-lang="python"> | 
    <p>The following example shows how to load a sample dataset, build a logistic regression model,
and make predictions with the resulting model to compute the training error.</p>
 |  | 
 |     <p>Note that the Python API does not yet support multiclass classification and model save/load but | 
 | will in the future.</p> | 
 |  | 
 |     <p>Refer to the <a href="api/python/reference/api/pyspark.mllib.classification.LogisticRegressionWithLBFGS.html"><code class="language-plaintext highlighter-rouge">LogisticRegressionWithLBFGS</code> Python docs</a> and <a href="api/python/reference/api/pyspark.mllib.classification.LogisticRegressionModel.html"><code class="language-plaintext highlighter-rouge">LogisticRegressionModel</code> Python docs</a> for more details on the API.</p> | 
 |  | 
 |     <div class="highlight"><pre class="codehilite"><code><span class="kn">from</span> <span class="nn">pyspark.mllib.classification</span> <span class="kn">import</span> <span class="n">LogisticRegressionWithLBFGS</span><span class="p">,</span> <span class="n">LogisticRegressionModel</span> | 
 | <span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="kn">import</span> <span class="n">LabeledPoint</span> | 
 |  | 
 | <span class="c1"># Load and parse the data | 
 | </span><span class="k">def</span> <span class="nf">parsePoint</span><span class="p">(</span><span class="n">line</span><span class="p">):</span> | 
 |     <span class="n">values</span> <span class="o">=</span> <span class="p">[</span><span class="nb">float</span><span class="p">(</span><span class="n">x</span><span class="p">)</span> <span class="k">for</span> <span class="n">x</span> <span class="ow">in</span> <span class="n">line</span><span class="p">.</span><span class="n">split</span><span class="p">(</span><span class="s">' '</span><span class="p">)]</span> | 
 |     <span class="k">return</span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="n">values</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">values</span><span class="p">[</span><span class="mi">1</span><span class="p">:])</span> | 
 |  | 
 | <span class="n">data</span> <span class="o">=</span> <span class="n">sc</span><span class="p">.</span><span class="n">textFile</span><span class="p">(</span><span class="s">"data/mllib/sample_svm_data.txt"</span><span class="p">)</span> | 
 | <span class="n">parsedData</span> <span class="o">=</span> <span class="n">data</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="n">parsePoint</span><span class="p">)</span> | 
 |  | 
 | <span class="c1"># Build the model | 
 | </span><span class="n">model</span> <span class="o">=</span> <span class="n">LogisticRegressionWithLBFGS</span><span class="p">.</span><span class="n">train</span><span class="p">(</span><span class="n">parsedData</span><span class="p">)</span> | 
 |  | 
 | <span class="c1"># Evaluating the model on training data | 
 | </span><span class="n">labelsAndPreds</span> <span class="o">=</span> <span class="n">parsedData</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">p</span><span class="p">:</span> <span class="p">(</span><span class="n">p</span><span class="p">.</span><span class="n">label</span><span class="p">,</span> <span class="n">model</span><span class="p">.</span><span class="n">predict</span><span class="p">(</span><span class="n">p</span><span class="p">.</span><span class="n">features</span><span class="p">)))</span> | 
 | <span class="n">trainErr</span> <span class="o">=</span> <span class="n">labelsAndPreds</span><span class="p">.</span><span class="nb">filter</span><span class="p">(</span><span class="k">lambda</span> <span class="n">lp</span><span class="p">:</span> <span class="n">lp</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span> <span class="o">!=</span> <span class="n">lp</span><span class="p">[</span><span class="mi">1</span><span class="p">]).</span><span class="n">count</span><span class="p">()</span> <span class="o">/</span> <span class="nb">float</span><span class="p">(</span><span class="n">parsedData</span><span class="p">.</span><span class="n">count</span><span class="p">())</span> | 
 | <span class="k">print</span><span class="p">(</span><span class="s">"Training Error = "</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">trainErr</span><span class="p">))</span> | 
 |  | 
 | <span class="c1"># Save and load model | 
 | </span><span class="n">model</span><span class="p">.</span><span class="n">save</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> <span class="s">"target/tmp/pythonLogisticRegressionWithLBFGSModel"</span><span class="p">)</span> | 
 | <span class="n">sameModel</span> <span class="o">=</span> <span class="n">LogisticRegressionModel</span><span class="p">.</span><span class="n">load</span><span class="p">(</span><span class="n">sc</span><span class="p">,</span> | 
 |                                          <span class="s">"target/tmp/pythonLogisticRegressionWithLBFGSModel"</span><span class="p">)</span></code></pre></div> | 
 |     <div><small>Find full example code at "examples/src/main/python/mllib/logistic_regression_with_lbfgs_example.py" in the Spark repo.</small></div> | 
 |   </div> | 
 |  | 
 | <div data-lang="scala"> | 
 |     <p>The following code illustrates how to load a sample multiclass dataset, split it into train and | 
 | test, and use | 
 | <a href="api/scala/org/apache/spark/mllib/classification/LogisticRegressionWithLBFGS.html">LogisticRegressionWithLBFGS</a> | 
 | to fit a logistic regression model. | 
 | Then the model is evaluated against the test dataset and saved to disk.</p> | 
 |  | 
 |     <p>Refer to the <a href="api/scala/org/apache/spark/mllib/classification/LogisticRegressionWithLBFGS.html"><code class="language-plaintext highlighter-rouge">LogisticRegressionWithLBFGS</code> Scala docs</a> and <a href="api/scala/org/apache/spark/mllib/classification/LogisticRegressionModel.html"><code class="language-plaintext highlighter-rouge">LogisticRegressionModel</code> Scala docs</a> for details on the API.</p> | 
 |  | 
 |     <div class="highlight"><pre class="codehilite"><code><span class="k">import</span> <span class="nn">org.apache.spark.mllib.classification.</span><span class="o">{</span><span class="nc">LogisticRegressionModel</span><span class="o">,</span> <span class="nc">LogisticRegressionWithLBFGS</span><span class="o">}</span> | 
 | <span class="k">import</span> <span class="nn">org.apache.spark.mllib.evaluation.MulticlassMetrics</span> | 
 | <span class="k">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span> | 
 | <span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span> | 
 |  | 
 | <span class="c1">// Load training data in LIBSVM format.</span> | 
 | <span class="k">val</span> <span class="nv">data</span> <span class="k">=</span> <span class="nv">MLUtils</span><span class="o">.</span><span class="py">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">)</span> | 
 |  | 
 | <span class="c1">// Split data into training (60%) and test (40%).</span> | 
 | <span class="k">val</span> <span class="nv">splits</span> <span class="k">=</span> <span class="nv">data</span><span class="o">.</span><span class="py">randomSplit</span><span class="o">(</span><span class="nc">Array</span><span class="o">(</span><span class="mf">0.6</span><span class="o">,</span> <span class="mf">0.4</span><span class="o">),</span> <span class="n">seed</span> <span class="k">=</span> <span class="mi">11L</span><span class="o">)</span> | 
 | <span class="k">val</span> <span class="nv">training</span> <span class="k">=</span> <span class="nf">splits</span><span class="o">(</span><span class="mi">0</span><span class="o">).</span><span class="py">cache</span><span class="o">()</span> | 
 | <span class="k">val</span> <span class="nv">test</span> <span class="k">=</span> <span class="nf">splits</span><span class="o">(</span><span class="mi">1</span><span class="o">)</span> | 
 |  | 
 | <span class="c1">// Run training algorithm to build the model</span> | 
 | <span class="k">val</span> <span class="nv">model</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">LogisticRegressionWithLBFGS</span><span class="o">()</span> | 
 |   <span class="o">.</span><span class="py">setNumClasses</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span> | 
 |   <span class="o">.</span><span class="py">run</span><span class="o">(</span><span class="n">training</span><span class="o">)</span> | 
 |  | 
 | <span class="c1">// Compute raw scores on the test set.</span> | 
 | <span class="k">val</span> <span class="nv">predictionAndLabels</span> <span class="k">=</span> <span class="nv">test</span><span class="o">.</span><span class="py">map</span> <span class="o">{</span> <span class="k">case</span> <span class="nc">LabeledPoint</span><span class="o">(</span><span class="n">label</span><span class="o">,</span> <span class="n">features</span><span class="o">)</span> <span class="k">=></span> | 
 |   <span class="k">val</span> <span class="nv">prediction</span> <span class="k">=</span> <span class="nv">model</span><span class="o">.</span><span class="py">predict</span><span class="o">(</span><span class="n">features</span><span class="o">)</span> | 
 |   <span class="o">(</span><span class="n">prediction</span><span class="o">,</span> <span class="n">label</span><span class="o">)</span> | 
 | <span class="o">}</span> | 
 |  | 
 | <span class="c1">// Get evaluation metrics.</span> | 
 | <span class="k">val</span> <span class="nv">metrics</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">MulticlassMetrics</span><span class="o">(</span><span class="n">predictionAndLabels</span><span class="o">)</span> | 
 | <span class="k">val</span> <span class="nv">accuracy</span> <span class="k">=</span> <span class="nv">metrics</span><span class="o">.</span><span class="py">accuracy</span> | 
 | <span class="nf">println</span><span class="o">(</span><span class="n">s</span><span class="s">"Accuracy = $accuracy"</span><span class="o">)</span> | 
 |  | 
 | <span class="c1">// Save and load model</span> | 
 | <span class="nv">model</span><span class="o">.</span><span class="py">save</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"target/tmp/scalaLogisticRegressionWithLBFGSModel"</span><span class="o">)</span> | 
 | <span class="k">val</span> <span class="nv">sameModel</span> <span class="k">=</span> <span class="nv">LogisticRegressionModel</span><span class="o">.</span><span class="py">load</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> | 
 |   <span class="s">"target/tmp/scalaLogisticRegressionWithLBFGSModel"</span><span class="o">)</span></code></pre></div> | 
 |     <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/LogisticRegressionWithLBFGSExample.scala" in the Spark repo.</small></div> | 
 |  | 
 |   </div> | 
 |  | 
 | <div data-lang="java"> | 
 |     <p>The following code illustrates how to load a sample multiclass dataset, split it into train and | 
 | test, and use | 
 | <a href="api/java/org/apache/spark/mllib/classification/LogisticRegressionWithLBFGS.html">LogisticRegressionWithLBFGS</a> | 
 | to fit a logistic regression model. | 
 | Then the model is evaluated against the test dataset and saved to disk.</p> | 
 |  | 
 |     <p>Refer to the <a href="api/java/org/apache/spark/mllib/classification/LogisticRegressionWithLBFGS.html"><code class="language-plaintext highlighter-rouge">LogisticRegressionWithLBFGS</code> Java docs</a> and <a href="api/java/org/apache/spark/mllib/classification/LogisticRegressionModel.html"><code class="language-plaintext highlighter-rouge">LogisticRegressionModel</code> Java docs</a> for details on the API.</p> | 
 |  | 
 |     <div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">scala.Tuple2</span><span class="o">;</span> | 
 |  | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaPairRDD</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.api.java.JavaRDD</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.classification.LogisticRegressionModel</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.evaluation.MulticlassMetrics</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span><span class="o">;</span> | 
 |  | 
 | <span class="nc">String</span> <span class="n">path</span> <span class="o">=</span> <span class="s">"data/mllib/sample_libsvm_data.txt"</span><span class="o">;</span> | 
 | <span class="nc">JavaRDD</span><span class="o"><</span><span class="nc">LabeledPoint</span><span class="o">></span> <span class="n">data</span> <span class="o">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="na">loadLibSVMFile</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="n">path</span><span class="o">).</span><span class="na">toJavaRDD</span><span class="o">();</span> | 
 |  | 
 | <span class="c1">// Split initial RDD into two... [60% training data, 40% testing data].</span> | 
 | <span class="nc">JavaRDD</span><span class="o"><</span><span class="nc">LabeledPoint</span><span class="o">>[]</span> <span class="n">splits</span> <span class="o">=</span> <span class="n">data</span><span class="o">.</span><span class="na">randomSplit</span><span class="o">(</span><span class="k">new</span> <span class="kt">double</span><span class="o">[]</span> <span class="o">{</span><span class="mf">0.6</span><span class="o">,</span> <span class="mf">0.4</span><span class="o">},</span> <span class="mi">11L</span><span class="o">);</span> | 
 | <span class="nc">JavaRDD</span><span class="o"><</span><span class="nc">LabeledPoint</span><span class="o">></span> <span class="n">training</span> <span class="o">=</span> <span class="n">splits</span><span class="o">[</span><span class="mi">0</span><span class="o">].</span><span class="na">cache</span><span class="o">();</span> | 
 | <span class="nc">JavaRDD</span><span class="o"><</span><span class="nc">LabeledPoint</span><span class="o">></span> <span class="n">test</span> <span class="o">=</span> <span class="n">splits</span><span class="o">[</span><span class="mi">1</span><span class="o">];</span> | 
 |  | 
 | <span class="c1">// Run training algorithm to build the model.</span> | 
 | <span class="nc">LogisticRegressionModel</span> <span class="n">model</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">LogisticRegressionWithLBFGS</span><span class="o">()</span> | 
 |   <span class="o">.</span><span class="na">setNumClasses</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span> | 
 |   <span class="o">.</span><span class="na">run</span><span class="o">(</span><span class="n">training</span><span class="o">.</span><span class="na">rdd</span><span class="o">());</span> | 
 |  | 
 | <span class="c1">// Compute raw scores on the test set.</span> | 
 | <span class="nc">JavaPairRDD</span><span class="o"><</span><span class="nc">Object</span><span class="o">,</span> <span class="nc">Object</span><span class="o">></span> <span class="n">predictionAndLabels</span> <span class="o">=</span> <span class="n">test</span><span class="o">.</span><span class="na">mapToPair</span><span class="o">(</span><span class="n">p</span> <span class="o">-></span> | 
 |   <span class="k">new</span> <span class="nc">Tuple2</span><span class="o"><>(</span><span class="n">model</span><span class="o">.</span><span class="na">predict</span><span class="o">(</span><span class="n">p</span><span class="o">.</span><span class="na">features</span><span class="o">()),</span> <span class="n">p</span><span class="o">.</span><span class="na">label</span><span class="o">()));</span> | 
 |  | 
 | <span class="c1">// Get evaluation metrics.</span> | 
 | <span class="nc">MulticlassMetrics</span> <span class="n">metrics</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">MulticlassMetrics</span><span class="o">(</span><span class="n">predictionAndLabels</span><span class="o">.</span><span class="na">rdd</span><span class="o">());</span> | 
 | <span class="kt">double</span> <span class="n">accuracy</span> <span class="o">=</span> <span class="n">metrics</span><span class="o">.</span><span class="na">accuracy</span><span class="o">();</span> | 
 | <span class="nc">System</span><span class="o">.</span><span class="na">out</span><span class="o">.</span><span class="na">println</span><span class="o">(</span><span class="s">"Accuracy = "</span> <span class="o">+</span> <span class="n">accuracy</span><span class="o">);</span> | 
 |  | 
 | <span class="c1">// Save and load model</span> | 
 | <span class="n">model</span><span class="o">.</span><span class="na">save</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> <span class="s">"target/tmp/javaLogisticRegressionWithLBFGSModel"</span><span class="o">);</span> | 
 | <span class="nc">LogisticRegressionModel</span> <span class="n">sameModel</span> <span class="o">=</span> <span class="nc">LogisticRegressionModel</span><span class="o">.</span><span class="na">load</span><span class="o">(</span><span class="n">sc</span><span class="o">,</span> | 
 |   <span class="s">"target/tmp/javaLogisticRegressionWithLBFGSModel"</span><span class="o">);</span></code></pre></div> | 
 |     <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/mllib/JavaLogisticRegressionWithLBFGSExample.java" in the Spark repo.</small></div> | 
 |   </div> | 
 |  | 
 | </div> | 
 |  | 
 | <h1 id="regression">Regression</h1> | 
 |  | 
 | <h3 id="linear-least-squares-lasso-and-ridge-regression">Linear least squares, Lasso, and ridge regression</h3> | 
 |  | 
 | <p>Linear least squares is the most common formulation for regression problems. | 
 | It is a linear method as described above in equation <code class="language-plaintext highlighter-rouge">$\eqref{eq:regPrimal}$</code>, with the loss | 
 | function in the formulation given by the squared loss: | 
 | <code class="language-plaintext highlighter-rouge">\[ | 
 | L(\wv;\x,y) :=  \frac{1}{2} (\wv^T \x - y)^2. | 
 | \]</code></p> | 
 |  | 
 | <p>Various related regression methods are derived by using different types of regularization: | 
 | <a href="http://en.wikipedia.org/wiki/Ordinary_least_squares"><em>ordinary least squares</em></a> or | 
 | <a href="http://en.wikipedia.org/wiki/Linear_least_squares_(mathematics)"><em>linear least squares</em></a> uses | 
 |  no regularization; <a href="http://en.wikipedia.org/wiki/Ridge_regression"><em>ridge regression</em></a> uses L2 | 
 | regularization; and <a href="http://en.wikipedia.org/wiki/Lasso_(statistics)"><em>Lasso</em></a> uses L1 | 
 | regularization.  For all of these models, the average loss or training error, $\frac{1}{n} \sum_{i=1}^n (\wv^T x_i - y_i)^2$, is | 
 | known as the <a href="http://en.wikipedia.org/wiki/Mean_squared_error">mean squared error</a>.</p> | 
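<p>As a minimal illustrative sketch (not one of the examples shipped in the Spark repo), the choice of
regularizer corresponds to the <code class="language-plaintext highlighter-rouge">regType</code> and
<code class="language-plaintext highlighter-rouge">regParam</code> arguments of
<code class="language-plaintext highlighter-rouge">LinearRegressionWithSGD.train</code> in the Python API.
The toy data and parameter values below are made up purely for illustration and assume an existing
<code class="language-plaintext highlighter-rouge">SparkContext</code> <code class="language-plaintext highlighter-rouge">sc</code>.</p>

<div class="highlight"><pre class="codehilite"><code>from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

# Toy data (illustrative only): y is roughly 2 * x.
data = sc.parallelize([
    LabeledPoint(0.0, [0.0]),
    LabeledPoint(2.1, [1.0]),
    LabeledPoint(3.9, [2.0]),
])

# regType=None gives ordinary least squares, "l2" gives ridge regression,
# and "l1" gives the Lasso; regParam controls the regularization strength.
ols   = LinearRegressionWithSGD.train(data, iterations=100, step=0.1, regType=None)
ridge = LinearRegressionWithSGD.train(data, iterations=100, step=0.1, regType="l2", regParam=0.1)
lasso = LinearRegressionWithSGD.train(data, iterations=100, step=0.1, regType="l1", regParam=0.1)

# The training error described above is the mean squared error.
mse = data.map(lambda p: (p.label - ols.predict(p.features)) ** 2).mean()
print("OLS training MSE = " + str(mse))</code></pre></div>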
 |  | 
 | <h3 id="streaming-linear-regression">Streaming linear regression</h3> | 
 |  | 
<p>When data arrives in a streaming fashion, it is useful to fit regression models online,
 | updating the parameters of the model as new data arrives. <code class="language-plaintext highlighter-rouge">spark.mllib</code> currently supports | 
 | streaming linear regression using ordinary least squares. The fitting is similar | 
 | to that performed offline, except fitting occurs on each batch of data, so that | 
 | the model continually updates to reflect the data from the stream.</p> | 
 |  | 
 | <p><strong>Examples</strong></p> | 
 |  | 
 | <p>The following example demonstrates how to load training and testing data from two different | 
 | input streams of text files, parse the streams as labeled points, fit a linear regression model | 
 | online to the first stream, and make predictions on the second stream.</p> | 
 |  | 
 | <div class="codetabs"> | 
 |  | 
 | <div data-lang="python"> | 
 |  | 
 |     <p>First, we import the necessary classes for parsing our input data and creating the model.</p> | 
 |  | 
 |     <p>Then we make input streams for training and testing data. We assume a StreamingContext <code class="language-plaintext highlighter-rouge">ssc</code> | 
has already been created; see the <a href="streaming-programming-guide.html#initializing">Spark Streaming Programming Guide</a>
 | for more info. For this example, we use labeled points in training and testing streams, | 
 | but in practice you will likely want to use unlabeled vectors for test data.</p> | 
 |  | 
 |     <p>We create our model by initializing the weights to 0.</p> | 
 |  | 
 |     <p>Now we register the streams for training and testing and start the job.</p> | 
 |  | 
 |     <p>We can now save text files with data to the training or testing folders. | 
 | Each line should be a data point formatted as <code class="language-plaintext highlighter-rouge">(y,[x1,x2,x3])</code> where <code class="language-plaintext highlighter-rouge">y</code> is the label | 
and <code class="language-plaintext highlighter-rouge">x1,x2,x3</code> are the features. Any time a text file is placed in <code class="language-plaintext highlighter-rouge">sys.argv[1]</code>,
the model will update. Any time a text file is placed in <code class="language-plaintext highlighter-rouge">sys.argv[2]</code>, you will see predictions.
 | As you feed more data to the training directory, the predictions | 
 | will get better!</p> | 
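    <p>For instance (values invented purely to illustrate the format), a file dropped into the training
folder could contain lines such as:</p>

    <div class="highlight"><pre class="codehilite"><code>(1.0,[0.5,1.2,3.1])
(0.0,[2.0,0.1,0.7])</code></pre></div>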
 |  | 
    <p>Here is a complete example:</p>
 |     <div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">sys</span> | 
 |  | 
 | <span class="kn">from</span> <span class="nn">pyspark.mllib.linalg</span> <span class="kn">import</span> <span class="n">Vectors</span> | 
 | <span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="kn">import</span> <span class="n">LabeledPoint</span> | 
 | <span class="kn">from</span> <span class="nn">pyspark.mllib.regression</span> <span class="kn">import</span> <span class="n">StreamingLinearRegressionWithSGD</span> | 
 |  | 
 | <span class="k">def</span> <span class="nf">parse</span><span class="p">(</span><span class="n">lp</span><span class="p">):</span> | 
 |     <span class="n">label</span> <span class="o">=</span> <span class="nb">float</span><span class="p">(</span><span class="n">lp</span><span class="p">[</span><span class="n">lp</span><span class="p">.</span><span class="n">find</span><span class="p">(</span><span class="s">'('</span><span class="p">)</span> <span class="o">+</span> <span class="mi">1</span><span class="p">:</span> <span class="n">lp</span><span class="p">.</span><span class="n">find</span><span class="p">(</span><span class="s">','</span><span class="p">)])</span> | 
 |     <span class="n">vec</span> <span class="o">=</span> <span class="n">Vectors</span><span class="p">.</span><span class="n">dense</span><span class="p">(</span><span class="n">lp</span><span class="p">[</span><span class="n">lp</span><span class="p">.</span><span class="n">find</span><span class="p">(</span><span class="s">'['</span><span class="p">)</span> <span class="o">+</span> <span class="mi">1</span><span class="p">:</span> <span class="n">lp</span><span class="p">.</span><span class="n">find</span><span class="p">(</span><span class="s">']'</span><span class="p">)].</span><span class="n">split</span><span class="p">(</span><span class="s">','</span><span class="p">))</span> | 
 |     <span class="k">return</span> <span class="n">LabeledPoint</span><span class="p">(</span><span class="n">label</span><span class="p">,</span> <span class="n">vec</span><span class="p">)</span> | 
 |  | 
 | <span class="n">trainingData</span> <span class="o">=</span> <span class="n">ssc</span><span class="p">.</span><span class="n">textFileStream</span><span class="p">(</span><span class="n">sys</span><span class="p">.</span><span class="n">argv</span><span class="p">[</span><span class="mi">1</span><span class="p">]).</span><span class="nb">map</span><span class="p">(</span><span class="n">parse</span><span class="p">).</span><span class="n">cache</span><span class="p">()</span> | 
 | <span class="n">testData</span> <span class="o">=</span> <span class="n">ssc</span><span class="p">.</span><span class="n">textFileStream</span><span class="p">(</span><span class="n">sys</span><span class="p">.</span><span class="n">argv</span><span class="p">[</span><span class="mi">2</span><span class="p">]).</span><span class="nb">map</span><span class="p">(</span><span class="n">parse</span><span class="p">)</span> | 
 |  | 
 | <span class="n">numFeatures</span> <span class="o">=</span> <span class="mi">3</span> | 
 | <span class="n">model</span> <span class="o">=</span> <span class="n">StreamingLinearRegressionWithSGD</span><span class="p">()</span> | 
 | <span class="n">model</span><span class="p">.</span><span class="n">setInitialWeights</span><span class="p">([</span><span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">,</span> <span class="mf">0.0</span><span class="p">])</span> | 
 |  | 
 | <span class="n">model</span><span class="p">.</span><span class="n">trainOn</span><span class="p">(</span><span class="n">trainingData</span><span class="p">)</span> | 
 | <span class="k">print</span><span class="p">(</span><span class="n">model</span><span class="p">.</span><span class="n">predictOnValues</span><span class="p">(</span><span class="n">testData</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">lp</span><span class="p">:</span> <span class="p">(</span><span class="n">lp</span><span class="p">.</span><span class="n">label</span><span class="p">,</span> <span class="n">lp</span><span class="p">.</span><span class="n">features</span><span class="p">))))</span> | 
 |  | 
 | <span class="n">ssc</span><span class="p">.</span><span class="n">start</span><span class="p">()</span> | 
 | <span class="n">ssc</span><span class="p">.</span><span class="n">awaitTermination</span><span class="p">()</span></code></pre></div> | 
 |     <div><small>Find full example code at "examples/src/main/python/mllib/streaming_linear_regression_example.py" in the Spark repo.</small></div> | 
 |  | 
 |   </div> | 
 |  | 
 | <div data-lang="scala"> | 
 |  | 
 |     <p>First, we import the necessary classes for parsing our input data and creating the model.</p> | 
 |  | 
 |     <p>Then we make input streams for training and testing data. We assume a StreamingContext <code class="language-plaintext highlighter-rouge">ssc</code> | 
has already been created; see the <a href="streaming-programming-guide.html#initializing">Spark Streaming Programming Guide</a>
 | for more info. For this example, we use labeled points in training and testing streams, | 
 | but in practice you will likely want to use unlabeled vectors for test data.</p> | 
 |  | 
    <p>We create our model by initializing the weights to zero, register the streams for training and
testing, and then start the job. Printing predictions alongside true labels lets us easily see the
result.</p>
 |  | 
 |     <p>Finally, we can save text files with data to the training or testing folders. | 
 | Each line should be a data point formatted as <code class="language-plaintext highlighter-rouge">(y,[x1,x2,x3])</code> where <code class="language-plaintext highlighter-rouge">y</code> is the label | 
and <code class="language-plaintext highlighter-rouge">x1,x2,x3</code> are the features. Any time a text file is placed in <code class="language-plaintext highlighter-rouge">args(0)</code>,
the model will update. Any time a text file is placed in <code class="language-plaintext highlighter-rouge">args(1)</code>, you will see predictions.
 | As you feed more data to the training directory, the predictions | 
 | will get better!</p> | 
 |  | 
 |     <p>Here is a complete example:</p> | 
 |     <div class="highlight"><pre class="codehilite"><code><span class="k">import</span> <span class="nn">org.apache.spark.mllib.linalg.Vectors</span> | 
 | <span class="k">import</span> <span class="nn">org.apache.spark.mllib.regression.LabeledPoint</span> | 
 | <span class="k">import</span> <span class="nn">org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD</span> | 
 |  | 
 | <span class="k">val</span> <span class="nv">trainingData</span> <span class="k">=</span> <span class="nv">ssc</span><span class="o">.</span><span class="py">textFileStream</span><span class="o">(</span><span class="nf">args</span><span class="o">(</span><span class="mi">0</span><span class="o">)).</span><span class="py">map</span><span class="o">(</span><span class="nv">LabeledPoint</span><span class="o">.</span><span class="py">parse</span><span class="o">).</span><span class="py">cache</span><span class="o">()</span> | 
 | <span class="k">val</span> <span class="nv">testData</span> <span class="k">=</span> <span class="nv">ssc</span><span class="o">.</span><span class="py">textFileStream</span><span class="o">(</span><span class="nf">args</span><span class="o">(</span><span class="mi">1</span><span class="o">)).</span><span class="py">map</span><span class="o">(</span><span class="nv">LabeledPoint</span><span class="o">.</span><span class="py">parse</span><span class="o">)</span> | 
 |  | 
 | <span class="k">val</span> <span class="nv">numFeatures</span> <span class="k">=</span> <span class="mi">3</span> | 
 | <span class="k">val</span> <span class="nv">model</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">StreamingLinearRegressionWithSGD</span><span class="o">()</span> | 
 |   <span class="o">.</span><span class="py">setInitialWeights</span><span class="o">(</span><span class="nv">Vectors</span><span class="o">.</span><span class="py">zeros</span><span class="o">(</span><span class="n">numFeatures</span><span class="o">))</span> | 
 |  | 
 | <span class="nv">model</span><span class="o">.</span><span class="py">trainOn</span><span class="o">(</span><span class="n">trainingData</span><span class="o">)</span> | 
 | <span class="nv">model</span><span class="o">.</span><span class="py">predictOnValues</span><span class="o">(</span><span class="nv">testData</span><span class="o">.</span><span class="py">map</span><span class="o">(</span><span class="n">lp</span> <span class="k">=></span> <span class="o">(</span><span class="nv">lp</span><span class="o">.</span><span class="py">label</span><span class="o">,</span> <span class="nv">lp</span><span class="o">.</span><span class="py">features</span><span class="o">))).</span><span class="py">print</span><span class="o">()</span> | 
 |  | 
 | <span class="nv">ssc</span><span class="o">.</span><span class="py">start</span><span class="o">()</span> | 
 | <span class="nv">ssc</span><span class="o">.</span><span class="py">awaitTermination</span><span class="o">()</span></code></pre></div> | 
 |     <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/mllib/StreamingLinearRegressionExample.scala" in the Spark repo.</small></div> | 
 |  | 
 |   </div> | 
 |  | 
 | </div> | 
 |  | 
 | <h1 id="implementation-developer">Implementation (developer)</h1> | 
 |  | 
<p>Behind the scenes, <code class="language-plaintext highlighter-rouge">spark.mllib</code> implements a simple distributed version of stochastic gradient descent
(SGD), building on the underlying gradient descent primitive (as described in the <a href="mllib-optimization.html">optimization</a> section).  All provided algorithms take as input a
regularization parameter (<code class="language-plaintext highlighter-rouge">regParam</code>) along with various parameters associated with stochastic
gradient descent (<code class="language-plaintext highlighter-rouge">stepSize</code>, <code class="language-plaintext highlighter-rouge">numIterations</code>, <code class="language-plaintext highlighter-rouge">miniBatchFraction</code>).  For each of them, we support
all three possible regularization types (none, L1, or L2).</p>
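<p>As a rough sketch of how these knobs surface in the Python API (the parameter values here are
arbitrary and only meant to show where each setting goes), an SVM can be trained like this, assuming
an existing <code class="language-plaintext highlighter-rouge">SparkContext</code> <code class="language-plaintext highlighter-rouge">sc</code>:</p>

<div class="highlight"><pre class="codehilite"><code>from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.util import MLUtils

data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

# iterations, step and miniBatchFraction are the SGD settings
# (numIterations, stepSize, miniBatchFraction); regParam and regType
# select the regularization (regType=None, "l1" or "l2").
model = SVMWithSGD.train(data,
                         iterations=100,
                         step=1.0,
                         miniBatchFraction=1.0,
                         regParam=0.01,
                         regType="l2")</code></pre></div>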
 |  | 
<p>For Logistic Regression, an <a href="api/scala/org/apache/spark/mllib/optimization/LBFGS.html">L-BFGS</a>
version is implemented in <a href="api/scala/org/apache/spark/mllib/classification/LogisticRegressionWithLBFGS.html">LogisticRegressionWithLBFGS</a>. This
version supports both binary and multinomial Logistic Regression, while the SGD version supports only
binary Logistic Regression. However, the L-BFGS version does not support L1 regularization, whereas the SGD
version does. When L1 regularization is not required, the L-BFGS version is strongly
recommended, since it converges faster and more accurately than SGD by approximating the
inverse Hessian matrix with a quasi-Newton method.</p>
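<p>For reference, here is a minimal Python sketch of the multinomial case (it reuses the sample dataset
and the class count of 10 from the examples above, and again assumes a
<code class="language-plaintext highlighter-rouge">SparkContext</code> <code class="language-plaintext highlighter-rouge">sc</code>):</p>

<div class="highlight"><pre class="codehilite"><code>from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils

data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt")

# numClasses defaults to 2 (binary); a larger value trains a multinomial
# model, which the SGD-based variant does not support.
model = LogisticRegressionWithLBFGS.train(data, numClasses=10)</code></pre></div>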
 |  | 
 | <p>Algorithms are all implemented in Scala:</p> | 
 |  | 
 | <ul> | 
 |   <li><a href="api/scala/org/apache/spark/mllib/classification/SVMWithSGD.html">SVMWithSGD</a></li> | 
 |   <li><a href="api/scala/org/apache/spark/mllib/classification/LogisticRegressionWithLBFGS.html">LogisticRegressionWithLBFGS</a></li> | 
 |   <li><a href="api/scala/org/apache/spark/mllib/classification/LogisticRegressionWithSGD.html">LogisticRegressionWithSGD</a></li> | 
 |   <li><a href="api/scala/org/apache/spark/mllib/regression/LinearRegressionWithSGD.html">LinearRegressionWithSGD</a></li> | 
 |   <li><a href="api/scala/org/apache/spark/mllib/regression/RidgeRegressionWithSGD.html">RidgeRegressionWithSGD</a></li> | 
 |   <li><a href="api/scala/org/apache/spark/mllib/regression/LassoWithSGD.html">LassoWithSGD</a></li> | 
 | </ul> | 
 |  | 
 |  | 
 |  | 
 |                 </div> | 
 |              | 
 |              <!-- /container --> | 
 |         </div> | 
 |  | 
 |         <script src="js/vendor/jquery-3.5.1.min.js"></script> | 
 |         <script src="js/vendor/bootstrap.bundle.min.js"></script> | 
 |  | 
 |         <script src="js/vendor/anchor.min.js"></script> | 
 |         <script src="js/main.js"></script> | 
 |  | 
 |         <script type="text/javascript" src="js/vendor/docsearch.min.js"></script> | 
 |         <script type="text/javascript"> | 
 |             // DocSearch is entirely free and automated. DocSearch is built in two parts: | 
            // 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link
            //    in your website and extracts content from every page it traverses. It then pushes this
            //    content to an Algolia index.
            // 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index
            //    to your search input and display its results in a dropdown UI. If you want to find more
            //    details on how DocSearch works, check the DocSearch docs.
 |             docsearch({ | 
 |     apiKey: 'd62f962a82bc9abb53471cb7b89da35e', | 
 |     appId: 'RAI69RXRSK', | 
 |     indexName: 'apache_spark', | 
 |     inputSelector: '#docsearch-input', | 
 |     enhancedSearchInput: true, | 
 |     algoliaOptions: { | 
 |       'facetFilters': ["version:3.5.0"] | 
 |     }, | 
 |     debug: false // Set debug to true if you want to inspect the dropdown | 
 | }); | 
 |  | 
 |         </script> | 
 |  | 
 |         <!-- MathJax Section --> | 
 |         <script type="text/x-mathjax-config"> | 
 |             MathJax.Hub.Config({ | 
 |                 TeX: { equationNumbers: { autoNumber: "AMS" } } | 
 |             }); | 
 |         </script> | 
 |         <script> | 
 |             // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. | 
 |             // We could use "//cdn.mathjax...", but that won't support "file://". | 
 |             (function(d, script) { | 
 |                 script = d.createElement('script'); | 
 |                 script.type = 'text/javascript'; | 
 |                 script.async = true; | 
 |                 script.onload = function(){ | 
 |                     MathJax.Hub.Config({ | 
 |                         tex2jax: { | 
 |                             inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], | 
 |                             displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], | 
 |                             processEscapes: true, | 
 |                             skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] | 
 |                         } | 
 |                     }); | 
 |                 }; | 
 |                 script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + | 
 |                     'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' + | 
 |                     '?config=TeX-AMS-MML_HTMLorMML'; | 
 |                 d.getElementsByTagName('head')[0].appendChild(script); | 
 |             }(document)); | 
 |         </script> | 
 |     </body> | 
 | </html> |