blob: 083cd437e8c35b2c4241b7eb30f57c8e9f57965d [file] [log] [blame]
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9" lang="en"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<title>Migration Guide: MLlib (Machine Learning) - Spark 3.3.4 Documentation</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<style>
body {
padding-top: 60px;
padding-bottom: 40px;
}
</style>
<meta name="viewport" content="width=device-width">
<link rel="stylesheet" href="css/main.css">
<script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
<link rel="stylesheet" href="css/pygments-default.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css" />
<link rel="stylesheet" href="css/docsearch.css">
<!-- Matomo -->
<script>
// Matomo command queue: reuse window._paq if it already exists, otherwise start with an empty array.
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
// Queue the "disableCookies" command ahead of the page-view command.
_paq.push(["disableCookies"]);
// Queue the page-view and link-tracking commands.
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
// The IIFE keeps the helper variables (u, d, g, s) out of the global scope.
(function() {
// Base URL shared by the tracker endpoint (matomo.php) and the tracker script (matomo.js).
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
// Create a script element for matomo.js, flag it async, and insert it before the
// first script element found in the document.
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body>
<!--[if lt IE 7]>
<p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
<![endif]-->
<!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
<nav class="navbar fixed-top navbar-expand-md navbar-light bg-light" id="topbar">
<div class="container">
<div class="navbar-header">
<div class="navbar-brand"><a href="index.html">
<img src="img/spark-logo-hd.png" alt="Apache Spark" style="height:50px;"/></a><span class="version">3.3.4</span>
</div>
</div>
<button class="navbar-toggler" type="button" data-toggle="collapse"
data-target="#navbarCollapse" aria-controls="navbarCollapse"
aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarCollapse">
<ul class="navbar-nav">
<!--TODO(andyk): Add class="active" attribute to li somehow.-->
<li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a>
<div class="dropdown-menu" aria-labelledby="navbarQuickStart">
<a class="dropdown-item" href="quick-start.html">Quick Start</a>
<a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a>
<a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a>
<a class="dropdown-item" href="structured-streaming-programming-guide.html">Structured Streaming</a>
<a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a>
<a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a>
<a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a>
<a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a>
<a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a>
<div class="dropdown-menu" aria-labelledby="navbarAPIDocs">
<a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a>
<a class="dropdown-item" href="api/java/index.html">Java</a>
<a class="dropdown-item" href="api/python/index.html">Python</a>
<a class="dropdown-item" href="api/R/index.html">R</a>
<a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a>
<div class="dropdown-menu" aria-labelledby="navbarDeploying">
<a class="dropdown-item" href="cluster-overview.html">Overview</a>
<a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a>
<a class="dropdown-item" href="running-on-mesos.html">Mesos</a>
<a class="dropdown-item" href="running-on-yarn.html">YARN</a>
<a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
<div class="dropdown-menu" aria-labelledby="navbarMore">
<a class="dropdown-item" href="configuration.html">Configuration</a>
<a class="dropdown-item" href="monitoring.html">Monitoring</a>
<a class="dropdown-item" href="tuning.html">Tuning Guide</a>
<a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a>
<a class="dropdown-item" href="security.html">Security</a>
<a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a>
<a class="dropdown-item" href="migration-guide.html">Migration Guide</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="building-spark.html">Building Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a>
</div>
</li>
<li class="nav-item">
<input type="text" id="docsearch-input" placeholder="Search the docs…" aria-label="Search the docs">
</li>
</ul>
<!--<span class="navbar-text navbar-right"><span class="version-text">v3.3.4</span></span>-->
</div>
</div>
</nav>
<div class="container-wrapper">
<div class="left-menu-wrapper">
<div class="left-menu">
<h3><a href="ml-guide.html">MLlib: Main Guide</a></h3>
<ul>
<li>
<a href="ml-statistics.html">
Basic statistics
</a>
</li>
<li>
<a href="ml-datasource.html">
Data sources
</a>
</li>
<li>
<a href="ml-pipeline.html">
Pipelines
</a>
</li>
<li>
<a href="ml-features.html">
Extracting, transforming and selecting features
</a>
</li>
<li>
<a href="ml-classification-regression.html">
Classification and Regression
</a>
</li>
<li>
<a href="ml-clustering.html">
Clustering
</a>
</li>
<li>
<a href="ml-collaborative-filtering.html">
Collaborative filtering
</a>
</li>
<li>
<a href="ml-frequent-pattern-mining.html">
Frequent Pattern Mining
</a>
</li>
<li>
<a href="ml-tuning.html">
Model selection and tuning
</a>
</li>
<li>
<a href="ml-advanced.html">
Advanced topics
</a>
</li>
</ul>
<h3><a href="mllib-guide.html">MLlib: RDD-based API Guide</a></h3>
<ul>
<li>
<a href="mllib-data-types.html">
Data types
</a>
</li>
<li>
<a href="mllib-statistics.html">
Basic statistics
</a>
</li>
<li>
<a href="mllib-classification-regression.html">
Classification and regression
</a>
</li>
<li>
<a href="mllib-collaborative-filtering.html">
Collaborative filtering
</a>
</li>
<li>
<a href="mllib-clustering.html">
Clustering
</a>
</li>
<li>
<a href="mllib-dimensionality-reduction.html">
Dimensionality reduction
</a>
</li>
<li>
<a href="mllib-feature-extraction.html">
Feature extraction and transformation
</a>
</li>
<li>
<a href="mllib-frequent-pattern-mining.html">
Frequent pattern mining
</a>
</li>
<li>
<a href="mllib-evaluation-metrics.html">
Evaluation metrics
</a>
</li>
<li>
<a href="mllib-pmml-model-export.html">
PMML model export
</a>
</li>
<li>
<a href="mllib-optimization.html">
Optimization (developer)
</a>
</li>
</ul>
</div>
</div>
<input id="nav-trigger" class="nav-trigger" checked type="checkbox">
<label for="nav-trigger"></label>
<div class="content-with-sidebar mr-3" id="content">
<h1 class="title">Migration Guide: MLlib (Machine Learning)</h1>
<ul id="markdown-toc">
<li><a href="#upgrading-from-mllib-24-to-30" id="markdown-toc-upgrading-from-mllib-24-to-30">Upgrading from MLlib 2.4 to 3.0</a></li>
<li><a href="#upgrading-from-mllib-22-to-23" id="markdown-toc-upgrading-from-mllib-22-to-23">Upgrading from MLlib 2.2 to 2.3</a></li>
<li><a href="#upgrading-from-mllib-21-to-22" id="markdown-toc-upgrading-from-mllib-21-to-22">Upgrading from MLlib 2.1 to 2.2</a></li>
<li><a href="#upgrading-from-mllib-20-to-21" id="markdown-toc-upgrading-from-mllib-20-to-21">Upgrading from MLlib 2.0 to 2.1</a></li>
<li><a href="#upgrading-from-mllib-16-to-20" id="markdown-toc-upgrading-from-mllib-16-to-20">Upgrading from MLlib 1.6 to 2.0</a></li>
<li><a href="#upgrading-from-mllib-15-to-16" id="markdown-toc-upgrading-from-mllib-15-to-16">Upgrading from MLlib 1.5 to 1.6</a></li>
<li><a href="#upgrading-from-mllib-14-to-15" id="markdown-toc-upgrading-from-mllib-14-to-15">Upgrading from MLlib 1.4 to 1.5</a></li>
<li><a href="#upgrading-from-mllib-13-to-14" id="markdown-toc-upgrading-from-mllib-13-to-14">Upgrading from MLlib 1.3 to 1.4</a></li>
<li><a href="#upgrading-from-mllib-12-to-13" id="markdown-toc-upgrading-from-mllib-12-to-13">Upgrading from MLlib 1.2 to 1.3</a></li>
<li><a href="#upgrading-from-mllib-11-to-12" id="markdown-toc-upgrading-from-mllib-11-to-12">Upgrading from MLlib 1.1 to 1.2</a></li>
<li><a href="#upgrading-from-mllib-10-to-11" id="markdown-toc-upgrading-from-mllib-10-to-11">Upgrading from MLlib 1.0 to 1.1</a></li>
<li><a href="#upgrading-from-mllib-09-to-10" id="markdown-toc-upgrading-from-mllib-09-to-10">Upgrading from MLlib 0.9 to 1.0</a></li>
</ul>
<p>Note that this migration guide describes the items specific to MLlib.
Many items of SQL migration can be applied when migrating MLlib to higher versions for DataFrame-based APIs.
Please refer to <a href="sql-migration-guide.html">Migration Guide: SQL, Datasets and DataFrame</a>.</p>
<h2 id="upgrading-from-mllib-24-to-30">Upgrading from MLlib 2.4 to 3.0</h2>
<h3 class="no_toc" id="breaking-changes">Breaking changes</h3>
<ul>
<li><code class="language-plaintext highlighter-rouge">OneHotEncoder</code> which is deprecated in 2.3, is removed in 3.0 and <code class="language-plaintext highlighter-rouge">OneHotEncoderEstimator</code> is now renamed to <code class="language-plaintext highlighter-rouge">OneHotEncoder</code>.</li>
<li><code class="language-plaintext highlighter-rouge">org.apache.spark.ml.image.ImageSchema.readImages</code> which is deprecated in 2.3, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">spark.read.format('image')</code> instead.</li>
<li><code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.clustering.KMeans.train</code> with param Int <code class="language-plaintext highlighter-rouge">runs</code> which is deprecated in 2.1, is removed in 3.0. Use <code class="language-plaintext highlighter-rouge">train</code> method without <code class="language-plaintext highlighter-rouge">runs</code> instead.</li>
<li><code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.classification.LogisticRegressionWithSGD</code> which is deprecated in 2.0, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">org.apache.spark.ml.classification.LogisticRegression</code> or <code class="language-plaintext highlighter-rouge">spark.mllib.classification.LogisticRegressionWithLBFGS</code> instead.</li>
<li><code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.feature.ChiSqSelectorModel.isSorted</code> which is deprecated in 2.1, is removed in 3.0. It is not intended for subclasses to use.</li>
<li><code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.regression.RidgeRegressionWithSGD</code> which is deprecated in 2.0, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">org.apache.spark.ml.regression.LinearRegression</code> with <code class="language-plaintext highlighter-rouge">elasticNetParam</code> = 0.0. Note the default <code class="language-plaintext highlighter-rouge">regParam</code> is 0.01 for <code class="language-plaintext highlighter-rouge">RidgeRegressionWithSGD</code>, but is 0.0 for <code class="language-plaintext highlighter-rouge">LinearRegression</code>.</li>
<li><code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.regression.LassoWithSGD</code> which is deprecated in 2.0, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">org.apache.spark.ml.regression.LinearRegression</code> with <code class="language-plaintext highlighter-rouge">elasticNetParam</code> = 1.0. Note the default <code class="language-plaintext highlighter-rouge">regParam</code> is 0.01 for <code class="language-plaintext highlighter-rouge">LassoWithSGD</code>, but is 0.0 for <code class="language-plaintext highlighter-rouge">LinearRegression</code>.</li>
<li><code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.regression.LinearRegressionWithSGD</code> which is deprecated in 2.0, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">org.apache.spark.ml.regression.LinearRegression</code> or <code class="language-plaintext highlighter-rouge">LBFGS</code> instead.</li>
<li><code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.clustering.KMeans.getRuns</code> and <code class="language-plaintext highlighter-rouge">setRuns</code> which are deprecated in 2.1, are removed in 3.0. They have had no effect since Spark 2.0.0.</li>
<li><code class="language-plaintext highlighter-rouge">org.apache.spark.ml.LinearSVCModel.setWeightCol</code> which is deprecated in 2.4, is removed in 3.0. It is not intended for users.</li>
<li>From 3.0, <code class="language-plaintext highlighter-rouge">org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel</code> extends <code class="language-plaintext highlighter-rouge">MultilayerPerceptronParams</code> to expose the training params. As a result, <code class="language-plaintext highlighter-rouge">layers</code> in <code class="language-plaintext highlighter-rouge">MultilayerPerceptronClassificationModel</code> has been changed from <code class="language-plaintext highlighter-rouge">Array[Int]</code> to <code class="language-plaintext highlighter-rouge">IntArrayParam</code>. Users should use <code class="language-plaintext highlighter-rouge">MultilayerPerceptronClassificationModel.getLayers</code> instead of <code class="language-plaintext highlighter-rouge">MultilayerPerceptronClassificationModel.layers</code> to retrieve the size of layers.</li>
<li><code class="language-plaintext highlighter-rouge">org.apache.spark.ml.classification.GBTClassifier.numTrees</code> which is deprecated in 2.4.5, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">getNumTrees</code> instead.</li>
<li><code class="language-plaintext highlighter-rouge">org.apache.spark.ml.clustering.KMeansModel.computeCost</code> which is deprecated in 2.4, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">ClusteringEvaluator</code> instead.</li>
<li>The member variable <code class="language-plaintext highlighter-rouge">precision</code> in <code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.evaluation.MulticlassMetrics</code> which is deprecated in 2.0, is removed in 3.0. Use <code class="language-plaintext highlighter-rouge">accuracy</code> instead.</li>
<li>The member variable <code class="language-plaintext highlighter-rouge">recall</code> in <code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.evaluation.MulticlassMetrics</code> which is deprecated in 2.0, is removed in 3.0. Use <code class="language-plaintext highlighter-rouge">accuracy</code> instead.</li>
<li>The member variable <code class="language-plaintext highlighter-rouge">fMeasure</code> in <code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.evaluation.MulticlassMetrics</code> which is deprecated in 2.0, is removed in 3.0. Use <code class="language-plaintext highlighter-rouge">accuracy</code> instead.</li>
<li><code class="language-plaintext highlighter-rouge">org.apache.spark.ml.util.GeneralMLWriter.context</code> which is deprecated in 2.0, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">session</code> instead.</li>
<li><code class="language-plaintext highlighter-rouge">org.apache.spark.ml.util.MLWriter.context</code> which is deprecated in 2.0, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">session</code> instead.</li>
<li><code class="language-plaintext highlighter-rouge">org.apache.spark.ml.util.MLReader.context</code> which is deprecated in 2.0, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">session</code> instead.</li>
<li><code class="language-plaintext highlighter-rouge">abstract class UnaryTransformer[IN, OUT, T &lt;: UnaryTransformer[IN, OUT, T]]</code> is changed to <code class="language-plaintext highlighter-rouge">abstract class UnaryTransformer[IN: TypeTag, OUT: TypeTag, T &lt;: UnaryTransformer[IN, OUT, T]]</code> in 3.0.</li>
</ul>
<h3 class="no_toc" id="deprecations-and-changes-of-behavior">Deprecations and changes of behavior</h3>
<p><strong>Deprecations</strong></p>
<ul>
<li><a href="https://issues.apache.org/jira/browse/SPARK-11215">SPARK-11215</a>:
<code class="language-plaintext highlighter-rouge">labels</code> in <code class="language-plaintext highlighter-rouge">StringIndexerModel</code> is deprecated and will be removed in 3.1.0. Use <code class="language-plaintext highlighter-rouge">labelsArray</code> instead.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-25758">SPARK-25758</a>:
<code class="language-plaintext highlighter-rouge">computeCost</code> in <code class="language-plaintext highlighter-rouge">BisectingKMeansModel</code> is deprecated and will be removed in future versions. Use <code class="language-plaintext highlighter-rouge">ClusteringEvaluator</code> instead.</li>
</ul>
<p><strong>Changes of behavior</strong></p>
<ul>
<li><a href="https://issues.apache.org/jira/browse/SPARK-11215">SPARK-11215</a>:
In Spark 2.4 and previous versions, when specifying <code class="language-plaintext highlighter-rouge">frequencyDesc</code> or <code class="language-plaintext highlighter-rouge">frequencyAsc</code> as
<code class="language-plaintext highlighter-rouge">stringOrderType</code> param in <code class="language-plaintext highlighter-rouge">StringIndexer</code>, in case of equal frequency, the order of
strings is undefined. Since Spark 3.0, the strings with equal frequency are further
sorted alphabetically. Also since Spark 3.0, <code class="language-plaintext highlighter-rouge">StringIndexer</code> supports encoding multiple
columns.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-20604">SPARK-20604</a>:
In releases prior to 3.0, <code class="language-plaintext highlighter-rouge">Imputer</code> required the input column to be Double or Float. In 3.0, this
restriction is lifted so <code class="language-plaintext highlighter-rouge">Imputer</code> can handle all numeric types.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-23469">SPARK-23469</a>:
In Spark 3.0, the <code class="language-plaintext highlighter-rouge">HashingTF</code> Transformer uses a corrected implementation of the murmur3 hash
function to hash elements to vectors. <code class="language-plaintext highlighter-rouge">HashingTF</code> in Spark 3.0 will map elements to
different positions in vectors than in Spark 2. However, <code class="language-plaintext highlighter-rouge">HashingTF</code> created with Spark 2.x
and loaded with Spark 3.0 will still use the previous hash function and will not change behavior.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-28969">SPARK-28969</a>:
The <code class="language-plaintext highlighter-rouge">setClassifier</code> method in PySpark&#8217;s <code class="language-plaintext highlighter-rouge">OneVsRestModel</code> has been removed in 3.0 for parity with
the Scala implementation. Callers should not need to set the classifier in the model after
creation.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-25790">SPARK-25790</a>:
PCA adds support for matrices with more than 65535 columns in Spark 3.0.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-28927">SPARK-28927</a>:
When fitting an ALS model on nondeterministic input data, previously if a rerun happened, users
would see an ArrayIndexOutOfBoundsException caused by a mismatch between In/Out user/item blocks.
From 3.0, a SparkException with a clearer message is thrown, wrapping the original
ArrayIndexOutOfBoundsException.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-29232">SPARK-29232</a>:
In releases prior to 3.0, <code class="language-plaintext highlighter-rouge">RandomForestRegressionModel</code> didn&#8217;t update the parameter maps
of the DecisionTreeRegressionModels underneath. This is fixed in 3.0.</li>
</ul>
<h2 id="upgrading-from-mllib-22-to-23">Upgrading from MLlib 2.2 to 2.3</h2>
<h3 class="no_toc" id="breaking-changes-1">Breaking changes</h3>
<ul>
<li>The class and trait hierarchy for logistic regression model summaries was changed to be cleaner
and better accommodate the addition of the multi-class summary. This is a breaking change for user
code that casts a <code class="language-plaintext highlighter-rouge">LogisticRegressionTrainingSummary</code> to a
<code class="language-plaintext highlighter-rouge">BinaryLogisticRegressionTrainingSummary</code>. Users should instead use the <code class="language-plaintext highlighter-rouge">model.binarySummary</code>
method. See <a href="https://issues.apache.org/jira/browse/SPARK-17139">SPARK-17139</a> for more detail
(<em>note</em> this is an <code class="language-plaintext highlighter-rouge">Experimental</code> API). This <em>does not</em> affect the Python <code class="language-plaintext highlighter-rouge">summary</code> method, which
will still work correctly for both multinomial and binary cases.</li>
</ul>
<h3 class="no_toc" id="deprecations-and-changes-of-behavior-1">Deprecations and changes of behavior</h3>
<p><strong>Deprecations</strong></p>
<ul>
<li><code class="language-plaintext highlighter-rouge">OneHotEncoder</code> has been deprecated and will be removed in <code class="language-plaintext highlighter-rouge">3.0</code>. It has been replaced by the
new <a href="ml-features.html#onehotencoderestimator"><code class="language-plaintext highlighter-rouge">OneHotEncoderEstimator</code></a>
(see <a href="https://issues.apache.org/jira/browse/SPARK-13030">SPARK-13030</a>). <strong>Note</strong> that
<code class="language-plaintext highlighter-rouge">OneHotEncoderEstimator</code> will be renamed to <code class="language-plaintext highlighter-rouge">OneHotEncoder</code> in <code class="language-plaintext highlighter-rouge">3.0</code> (but
<code class="language-plaintext highlighter-rouge">OneHotEncoderEstimator</code> will be kept as an alias).</li>
</ul>
<p><strong>Changes of behavior</strong></p>
<ul>
<li><a href="https://issues.apache.org/jira/browse/SPARK-21027">SPARK-21027</a>:
The default parallelism used in <code class="language-plaintext highlighter-rouge">OneVsRest</code> is now set to 1 (i.e. serial). In <code class="language-plaintext highlighter-rouge">2.2</code> and
earlier versions, the level of parallelism was set to the default threadpool size in Scala.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-22156">SPARK-22156</a>:
The learning rate update for <code class="language-plaintext highlighter-rouge">Word2Vec</code> was incorrect when <code class="language-plaintext highlighter-rouge">numIterations</code> was set greater than
<code class="language-plaintext highlighter-rouge">1</code>. This will cause training results to be different between <code class="language-plaintext highlighter-rouge">2.3</code> and earlier versions.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-21681">SPARK-21681</a>:
Fixed an edge case bug in multinomial logistic regression that resulted in incorrect coefficients
when some features had zero variance.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-16957">SPARK-16957</a>:
Tree algorithms now use mid-points for split values. This may change results from model training.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-14657">SPARK-14657</a>:
Fixed an issue where the features generated by <code class="language-plaintext highlighter-rouge">RFormula</code> without an intercept were inconsistent
with the output in R. This may change results from model training in this scenario.</li>
</ul>
<h2 id="upgrading-from-mllib-21-to-22">Upgrading from MLlib 2.1 to 2.2</h2>
<h3 class="no_toc" id="breaking-changes-2">Breaking changes</h3>
<p>There are no breaking changes.</p>
<h3 class="no_toc" id="deprecations-and-changes-of-behavior-2">Deprecations and changes of behavior</h3>
<p><strong>Deprecations</strong></p>
<p>There are no deprecations.</p>
<p><strong>Changes of behavior</strong></p>
<ul>
<li><a href="https://issues.apache.org/jira/browse/SPARK-19787">SPARK-19787</a>:
Default value of <code class="language-plaintext highlighter-rouge">regParam</code> changed from <code class="language-plaintext highlighter-rouge">1.0</code> to <code class="language-plaintext highlighter-rouge">0.1</code> for <code class="language-plaintext highlighter-rouge">ALS.train</code> method (marked <code class="language-plaintext highlighter-rouge">DeveloperApi</code>).
<strong>Note</strong> this does <em>not affect</em> the <code class="language-plaintext highlighter-rouge">ALS</code> Estimator or Model, nor MLlib&#8217;s <code class="language-plaintext highlighter-rouge">ALS</code> class.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-14772">SPARK-14772</a>:
Fixed inconsistency between Python and Scala APIs for <code class="language-plaintext highlighter-rouge">Param.copy</code> method.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-11569">SPARK-11569</a>:
<code class="language-plaintext highlighter-rouge">StringIndexer</code> now handles <code class="language-plaintext highlighter-rouge">NULL</code> values in the same way as unseen values. Previously an exception
would always be thrown regardless of the setting of the <code class="language-plaintext highlighter-rouge">handleInvalid</code> parameter.</li>
</ul>
<h2 id="upgrading-from-mllib-20-to-21">Upgrading from MLlib 2.0 to 2.1</h2>
<h3 class="no_toc" id="breaking-changes-3">Breaking changes</h3>
<p><strong>Deprecated methods removed</strong></p>
<ul>
<li><code class="language-plaintext highlighter-rouge">setLabelCol</code> in <code class="language-plaintext highlighter-rouge">feature.ChiSqSelectorModel</code></li>
<li><code class="language-plaintext highlighter-rouge">numTrees</code> in <code class="language-plaintext highlighter-rouge">classification.RandomForestClassificationModel</code> (This now refers to the Param called <code class="language-plaintext highlighter-rouge">numTrees</code>)</li>
<li><code class="language-plaintext highlighter-rouge">numTrees</code> in <code class="language-plaintext highlighter-rouge">regression.RandomForestRegressionModel</code> (This now refers to the Param called <code class="language-plaintext highlighter-rouge">numTrees</code>)</li>
<li><code class="language-plaintext highlighter-rouge">model</code> in <code class="language-plaintext highlighter-rouge">regression.LinearRegressionSummary</code></li>
<li><code class="language-plaintext highlighter-rouge">validateParams</code> in <code class="language-plaintext highlighter-rouge">PipelineStage</code></li>
<li><code class="language-plaintext highlighter-rouge">validateParams</code> in <code class="language-plaintext highlighter-rouge">Evaluator</code></li>
</ul>
<h3 class="no_toc" id="deprecations-and-changes-of-behavior-3">Deprecations and changes of behavior</h3>
<p><strong>Deprecations</strong></p>
<ul>
<li><a href="https://issues.apache.org/jira/browse/SPARK-18592">SPARK-18592</a>:
Deprecate all Param setter methods except for input/output column Params for <code class="language-plaintext highlighter-rouge">DecisionTreeClassificationModel</code>, <code class="language-plaintext highlighter-rouge">GBTClassificationModel</code>, <code class="language-plaintext highlighter-rouge">RandomForestClassificationModel</code>, <code class="language-plaintext highlighter-rouge">DecisionTreeRegressionModel</code>, <code class="language-plaintext highlighter-rouge">GBTRegressionModel</code> and <code class="language-plaintext highlighter-rouge">RandomForestRegressionModel</code></li>
</ul>
<p><strong>Changes of behavior</strong></p>
<ul>
<li><a href="https://issues.apache.org/jira/browse/SPARK-17870">SPARK-17870</a>:
Fixed a bug in <code class="language-plaintext highlighter-rouge">ChiSqSelector</code> which will likely change its result. <code class="language-plaintext highlighter-rouge">ChiSqSelector</code> now uses the pValue rather than the raw statistic to select a fixed number of top features.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-3261">SPARK-3261</a>:
<code class="language-plaintext highlighter-rouge">KMeans</code> returns potentially fewer than k cluster centers in cases where k distinct centroids aren&#8217;t available or aren&#8217;t selected.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-17389">SPARK-17389</a>:
<code class="language-plaintext highlighter-rouge">KMeans</code> reduces the default number of steps from 5 to 2 for the k-means|| initialization mode.</li>
</ul>
<h2 id="upgrading-from-mllib-16-to-20">Upgrading from MLlib 1.6 to 2.0</h2>
<h3 class="no_toc" id="breaking-changes-4">Breaking changes</h3>
<p>There were several breaking changes in Spark 2.0, which are outlined below.</p>
<p><strong>Linear algebra classes for DataFrame-based APIs</strong></p>
<p>Spark&#8217;s linear algebra dependencies were moved to a new project, <code class="language-plaintext highlighter-rouge">mllib-local</code>
(see <a href="https://issues.apache.org/jira/browse/SPARK-13944">SPARK-13944</a>).
As part of this change, the linear algebra classes were copied to a new package, <code class="language-plaintext highlighter-rouge">spark.ml.linalg</code>.
The DataFrame-based APIs in <code class="language-plaintext highlighter-rouge">spark.ml</code> now depend on the <code class="language-plaintext highlighter-rouge">spark.ml.linalg</code> classes,
leading to a few breaking changes, predominantly in various model classes
(see <a href="https://issues.apache.org/jira/browse/SPARK-14810">SPARK-14810</a> for a full list).</p>
<p><strong>Note:</strong> the RDD-based APIs in <code class="language-plaintext highlighter-rouge">spark.mllib</code> continue to depend on the previous package <code class="language-plaintext highlighter-rouge">spark.mllib.linalg</code>.</p>
<p><em>Converting vectors and matrices</em></p>
<p>While most pipeline components support backward compatibility for loading,
some existing <code class="language-plaintext highlighter-rouge">DataFrames</code> and pipelines in Spark versions prior to 2.0, that contain vector or matrix
columns, may need to be migrated to the new <code class="language-plaintext highlighter-rouge">spark.ml</code> vector and matrix types.
Utilities for converting <code class="language-plaintext highlighter-rouge">DataFrame</code> columns from <code class="language-plaintext highlighter-rouge">spark.mllib.linalg</code> to <code class="language-plaintext highlighter-rouge">spark.ml.linalg</code> types
(and vice versa) can be found in <code class="language-plaintext highlighter-rouge">spark.mllib.util.MLUtils</code>.</p>
<p>There are also utility methods available for converting single instances of
vectors and matrices. Use the <code class="language-plaintext highlighter-rouge">asML</code> method on a <code class="language-plaintext highlighter-rouge">mllib.linalg.Vector</code> / <code class="language-plaintext highlighter-rouge">mllib.linalg.Matrix</code>
for converting to <code class="language-plaintext highlighter-rouge">ml.linalg</code> types, and
<code class="language-plaintext highlighter-rouge">mllib.linalg.Vectors.fromML</code> / <code class="language-plaintext highlighter-rouge">mllib.linalg.Matrices.fromML</code>
for converting to <code class="language-plaintext highlighter-rouge">mllib.linalg</code> types.</p>
<div class="codetabs">
<div data-lang="scala">
<figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span>
<span class="c1">// convert DataFrame columns</span>
<span class="k">val</span> <span class="nv">convertedVecDF</span> <span class="k">=</span> <span class="nv">MLUtils</span><span class="o">.</span><span class="py">convertVectorColumnsToML</span><span class="o">(</span><span class="n">vecDF</span><span class="o">)</span>
<span class="k">val</span> <span class="nv">convertedMatrixDF</span> <span class="k">=</span> <span class="nv">MLUtils</span><span class="o">.</span><span class="py">convertMatrixColumnsToML</span><span class="o">(</span><span class="n">matrixDF</span><span class="o">)</span>
<span class="c1">// convert a single vector or matrix</span>
<span class="k">val</span> <span class="nv">mlVec</span><span class="k">:</span> <span class="kt">org.apache.spark.ml.linalg.Vector</span> <span class="o">=</span> <span class="nv">mllibVec</span><span class="o">.</span><span class="py">asML</span>
<span class="k">val</span> <span class="nv">mlMat</span><span class="k">:</span> <span class="kt">org.apache.spark.ml.linalg.Matrix</span> <span class="o">=</span> <span class="nv">mllibMat</span><span class="o">.</span><span class="py">asML</span></code></pre></figure>
<p>Refer to the <a href="api/scala/org/apache/spark/mllib/util/MLUtils$.html"><code class="language-plaintext highlighter-rouge">MLUtils</code> Scala docs</a> for further detail.</p>
</div>
<div data-lang="java">
<figure class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span><span class="o">;</span>
<span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span>
<span class="c1">// convert DataFrame columns</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">convertedVecDF</span> <span class="o">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="na">convertVectorColumnsToML</span><span class="o">(</span><span class="n">vecDF</span><span class="o">);</span>
<span class="nc">Dataset</span><span class="o">&lt;</span><span class="nc">Row</span><span class="o">&gt;</span> <span class="n">convertedMatrixDF</span> <span class="o">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="na">convertMatrixColumnsToML</span><span class="o">(</span><span class="n">matrixDF</span><span class="o">);</span>
<span class="c1">// convert a single vector or matrix</span>
<span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">ml</span><span class="o">.</span><span class="na">linalg</span><span class="o">.</span><span class="na">Vector</span> <span class="n">mlVec</span> <span class="o">=</span> <span class="n">mllibVec</span><span class="o">.</span><span class="na">asML</span><span class="o">();</span>
<span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">ml</span><span class="o">.</span><span class="na">linalg</span><span class="o">.</span><span class="na">Matrix</span> <span class="n">mlMat</span> <span class="o">=</span> <span class="n">mllibMat</span><span class="o">.</span><span class="na">asML</span><span class="o">();</span></code></pre></figure>
<p>Refer to the <a href="api/java/org/apache/spark/mllib/util/MLUtils.html"><code class="language-plaintext highlighter-rouge">MLUtils</code> Java docs</a> for further detail.</p>
</div>
<div data-lang="python">
<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="kn">import</span> <span class="n">MLUtils</span>
<span class="c1"># convert DataFrame columns
</span><span class="n">convertedVecDF</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="p">.</span><span class="n">convertVectorColumnsToML</span><span class="p">(</span><span class="n">vecDF</span><span class="p">)</span>
<span class="n">convertedMatrixDF</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="p">.</span><span class="n">convertMatrixColumnsToML</span><span class="p">(</span><span class="n">matrixDF</span><span class="p">)</span>
<span class="c1"># convert a single vector or matrix
</span><span class="n">mlVec</span> <span class="o">=</span> <span class="n">mllibVec</span><span class="p">.</span><span class="n">asML</span><span class="p">()</span>
<span class="n">mlMat</span> <span class="o">=</span> <span class="n">mllibMat</span><span class="p">.</span><span class="n">asML</span><span class="p">()</span></code></pre></figure>
<p>Refer to the <a href="api/python/reference/api/pyspark.mllib.util.MLUtils.html#pyspark.mllib.util.MLUtils"><code class="language-plaintext highlighter-rouge">MLUtils</code> Python docs</a> for further detail.</p>
</div>
</div>
<p><strong>Deprecated methods removed</strong></p>
<p>Several deprecated methods were removed in the <code class="language-plaintext highlighter-rouge">spark.mllib</code> and <code class="language-plaintext highlighter-rouge">spark.ml</code> packages:</p>
<ul>
<li><code class="language-plaintext highlighter-rouge">setScoreCol</code> in <code class="language-plaintext highlighter-rouge">ml.evaluation.BinaryClassificationEvaluator</code></li>
<li><code class="language-plaintext highlighter-rouge">weights</code> in <code class="language-plaintext highlighter-rouge">LinearRegression</code> and <code class="language-plaintext highlighter-rouge">LogisticRegression</code> in <code class="language-plaintext highlighter-rouge">spark.ml</code></li>
<li><code class="language-plaintext highlighter-rouge">setMaxNumIterations</code> in <code class="language-plaintext highlighter-rouge">mllib.optimization.LBFGS</code> (marked as <code class="language-plaintext highlighter-rouge">DeveloperApi</code>)</li>
<li><code class="language-plaintext highlighter-rouge">treeReduce</code> and <code class="language-plaintext highlighter-rouge">treeAggregate</code> in <code class="language-plaintext highlighter-rouge">mllib.rdd.RDDFunctions</code> (these functions are available on <code class="language-plaintext highlighter-rouge">RDD</code>s directly, and were marked as <code class="language-plaintext highlighter-rouge">DeveloperApi</code>)</li>
<li><code class="language-plaintext highlighter-rouge">defaultStrategy</code> in <code class="language-plaintext highlighter-rouge">mllib.tree.configuration.Strategy</code></li>
<li><code class="language-plaintext highlighter-rouge">build</code> in <code class="language-plaintext highlighter-rouge">mllib.tree.Node</code></li>
<li>libsvm loaders for multiclass and load/save labeledData methods in <code class="language-plaintext highlighter-rouge">mllib.util.MLUtils</code></li>
</ul>
<p>A full list of breaking changes can be found at <a href="https://issues.apache.org/jira/browse/SPARK-14810">SPARK-14810</a>.</p>
<h3 class="no_toc" id="deprecations-and-changes-of-behavior-4">Deprecations and changes of behavior</h3>
<p><strong>Deprecations</strong></p>
<p>Deprecations in the <code class="language-plaintext highlighter-rouge">spark.mllib</code> and <code class="language-plaintext highlighter-rouge">spark.ml</code> packages include:</p>
<ul>
<li><a href="https://issues.apache.org/jira/browse/SPARK-14984">SPARK-14984</a>:
In <code class="language-plaintext highlighter-rouge">spark.ml.regression.LinearRegressionSummary</code>, the <code class="language-plaintext highlighter-rouge">model</code> field has been deprecated.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-13784">SPARK-13784</a>:
In <code class="language-plaintext highlighter-rouge">spark.ml.regression.RandomForestRegressionModel</code> and <code class="language-plaintext highlighter-rouge">spark.ml.classification.RandomForestClassificationModel</code>,
the <code class="language-plaintext highlighter-rouge">numTrees</code> parameter has been deprecated in favor of the <code class="language-plaintext highlighter-rouge">getNumTrees</code> method.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-13761">SPARK-13761</a>:
In <code class="language-plaintext highlighter-rouge">spark.ml.param.Params</code>, the <code class="language-plaintext highlighter-rouge">validateParams</code> method has been deprecated.
We move all functionality in overridden methods to the corresponding <code class="language-plaintext highlighter-rouge">transformSchema</code>.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-14829">SPARK-14829</a>:
In <code class="language-plaintext highlighter-rouge">spark.mllib</code> package, <code class="language-plaintext highlighter-rouge">LinearRegressionWithSGD</code>, <code class="language-plaintext highlighter-rouge">LassoWithSGD</code>, <code class="language-plaintext highlighter-rouge">RidgeRegressionWithSGD</code> and <code class="language-plaintext highlighter-rouge">LogisticRegressionWithSGD</code> have been deprecated.
We encourage users to use <code class="language-plaintext highlighter-rouge">spark.ml.regression.LinearRegression</code> and <code class="language-plaintext highlighter-rouge">spark.ml.classification.LogisticRegression</code>.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-14900">SPARK-14900</a>:
In <code class="language-plaintext highlighter-rouge">spark.mllib.evaluation.MulticlassMetrics</code>, the parameters <code class="language-plaintext highlighter-rouge">precision</code>, <code class="language-plaintext highlighter-rouge">recall</code> and <code class="language-plaintext highlighter-rouge">fMeasure</code> have been deprecated in favor of <code class="language-plaintext highlighter-rouge">accuracy</code>.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-15644">SPARK-15644</a>:
In <code class="language-plaintext highlighter-rouge">spark.ml.util.MLReader</code> and <code class="language-plaintext highlighter-rouge">spark.ml.util.MLWriter</code>, the <code class="language-plaintext highlighter-rouge">context</code> method has been deprecated in favor of <code class="language-plaintext highlighter-rouge">session</code>.</li>
<li>In <code class="language-plaintext highlighter-rouge">spark.ml.feature.ChiSqSelectorModel</code>, the <code class="language-plaintext highlighter-rouge">setLabelCol</code> method has been deprecated since it was not used by <code class="language-plaintext highlighter-rouge">ChiSqSelectorModel</code>.</li>
</ul>
<p><strong>Changes of behavior</strong></p>
<p>Changes of behavior in the <code class="language-plaintext highlighter-rouge">spark.mllib</code> and <code class="language-plaintext highlighter-rouge">spark.ml</code> packages include:</p>
<ul>
<li><a href="https://issues.apache.org/jira/browse/SPARK-7780">SPARK-7780</a>:
<code class="language-plaintext highlighter-rouge">spark.mllib.classification.LogisticRegressionWithLBFGS</code> directly calls <code class="language-plaintext highlighter-rouge">spark.ml.classification.LogisticRegression</code> for binary classification now.
This will introduce the following behavior changes for <code class="language-plaintext highlighter-rouge">spark.mllib.classification.LogisticRegressionWithLBFGS</code>:
<ul>
<li>The intercept will not be regularized when training a binary classification model with the L1/L2 Updater.</li>
<li>If users train without regularization, training with or without feature scaling will return the same solution at the same convergence rate.</li>
</ul>
</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-13429">SPARK-13429</a>:
In order to provide better and more consistent results with <code class="language-plaintext highlighter-rouge">spark.ml.classification.LogisticRegression</code>,
the default value of <code class="language-plaintext highlighter-rouge">spark.mllib.classification.LogisticRegressionWithLBFGS</code>: <code class="language-plaintext highlighter-rouge">convergenceTol</code> has been changed from 1E-4 to 1E-6.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-12363">SPARK-12363</a>:
Fixed a bug in <code class="language-plaintext highlighter-rouge">PowerIterationClustering</code>, which will likely change its results.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-13048">SPARK-13048</a>:
<code class="language-plaintext highlighter-rouge">LDA</code> using the <code class="language-plaintext highlighter-rouge">EM</code> optimizer will keep the last checkpoint by default, if checkpointing is being used.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-12153">SPARK-12153</a>:
<code class="language-plaintext highlighter-rouge">Word2Vec</code> now respects sentence boundaries. Previously, it did not handle them correctly.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-10574">SPARK-10574</a>:
<code class="language-plaintext highlighter-rouge">HashingTF</code> uses <code class="language-plaintext highlighter-rouge">MurmurHash3</code> as default hash algorithm in both <code class="language-plaintext highlighter-rouge">spark.ml</code> and <code class="language-plaintext highlighter-rouge">spark.mllib</code>.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-14768">SPARK-14768</a>:
The <code class="language-plaintext highlighter-rouge">expectedType</code> argument for PySpark <code class="language-plaintext highlighter-rouge">Param</code> was removed.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-14931">SPARK-14931</a>:
Some default <code class="language-plaintext highlighter-rouge">Param</code> values, which were mismatched between pipelines in Scala and Python, have been changed.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-13600">SPARK-13600</a>:
<code class="language-plaintext highlighter-rouge">QuantileDiscretizer</code> now uses <code class="language-plaintext highlighter-rouge">spark.sql.DataFrameStatFunctions.approxQuantile</code> to find splits (previously used custom sampling logic).
The output buckets will differ for the same input data and params.</li>
</ul>
<h2 id="upgrading-from-mllib-15-to-16">Upgrading from MLlib 1.5 to 1.6</h2>
<p>There are no breaking API changes in the <code class="language-plaintext highlighter-rouge">spark.mllib</code> or <code class="language-plaintext highlighter-rouge">spark.ml</code> packages, but there are
deprecations and changes of behavior.</p>
<p>Deprecations:</p>
<ul>
<li><a href="https://issues.apache.org/jira/browse/SPARK-11358">SPARK-11358</a>:
In <code class="language-plaintext highlighter-rouge">spark.mllib.clustering.KMeans</code>, the <code class="language-plaintext highlighter-rouge">runs</code> parameter has been deprecated.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-10592">SPARK-10592</a>:
In <code class="language-plaintext highlighter-rouge">spark.ml.classification.LogisticRegressionModel</code> and
<code class="language-plaintext highlighter-rouge">spark.ml.regression.LinearRegressionModel</code>, the <code class="language-plaintext highlighter-rouge">weights</code> field has been deprecated in favor of
the new name <code class="language-plaintext highlighter-rouge">coefficients</code>. This helps disambiguate from instance (row) &#8220;weights&#8221; given to
algorithms.</li>
</ul>
<p>Changes of behavior:</p>
<ul>
<li><a href="https://issues.apache.org/jira/browse/SPARK-7770">SPARK-7770</a>:
<code class="language-plaintext highlighter-rouge">spark.mllib.tree.GradientBoostedTrees</code>: <code class="language-plaintext highlighter-rouge">validationTol</code> has changed semantics in 1.6.
Previously, it was a threshold for absolute change in error. Now, it resembles the behavior of
<code class="language-plaintext highlighter-rouge">GradientDescent</code>&#8217;s <code class="language-plaintext highlighter-rouge">convergenceTol</code>: For large errors, it uses relative error (relative to the
previous error); for small errors (<code class="language-plaintext highlighter-rouge">&lt; 0.01</code>), it uses absolute error.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-11069">SPARK-11069</a>:
<code class="language-plaintext highlighter-rouge">spark.ml.feature.RegexTokenizer</code>: Previously, it did not convert strings to lowercase before
tokenizing. Now, it converts to lowercase by default, with an option not to. This matches the
behavior of the simpler <code class="language-plaintext highlighter-rouge">Tokenizer</code> transformer.</li>
</ul>
<h2 id="upgrading-from-mllib-14-to-15">Upgrading from MLlib 1.4 to 1.5</h2>
<p>In the <code class="language-plaintext highlighter-rouge">spark.mllib</code> package, there are no breaking API changes but several behavior changes:</p>
<ul>
<li><a href="https://issues.apache.org/jira/browse/SPARK-9005">SPARK-9005</a>:
<code class="language-plaintext highlighter-rouge">RegressionMetrics.explainedVariance</code> returns the average regression sum of squares.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-8600">SPARK-8600</a>: <code class="language-plaintext highlighter-rouge">NaiveBayesModel.labels</code> is now
sorted.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-3382">SPARK-3382</a>: <code class="language-plaintext highlighter-rouge">GradientDescent</code> has a default
convergence tolerance <code class="language-plaintext highlighter-rouge">1e-3</code>, and hence iterations might end earlier than 1.4.</li>
</ul>
<p>In the <code class="language-plaintext highlighter-rouge">spark.ml</code> package, there exists one breaking API change and one behavior change:</p>
<ul>
<li><a href="https://issues.apache.org/jira/browse/SPARK-9268">SPARK-9268</a>: Java&#8217;s varargs support is removed
from <code class="language-plaintext highlighter-rouge">Params.setDefault</code> due to a
<a href="https://issues.scala-lang.org/browse/SI-9013">Scala compiler bug</a>.</li>
<li><a href="https://issues.apache.org/jira/browse/SPARK-10097">SPARK-10097</a>: <code class="language-plaintext highlighter-rouge">Evaluator.isLargerBetter</code> is
added to indicate metric ordering. Metrics like RMSE no longer flip signs as in 1.4.</li>
</ul>
<h2 id="upgrading-from-mllib-13-to-14">Upgrading from MLlib 1.3 to 1.4</h2>
<p>In the <code class="language-plaintext highlighter-rouge">spark.mllib</code> package, there were several breaking changes, but all in <code class="language-plaintext highlighter-rouge">DeveloperApi</code> or <code class="language-plaintext highlighter-rouge">Experimental</code> APIs:</p>
<ul>
<li>Gradient-Boosted Trees
<ul>
<li><em>(Breaking change)</em> The signature of the <a href="api/scala/org/apache/spark/mllib/tree/loss/Loss.html"><code class="language-plaintext highlighter-rouge">Loss.gradient</code></a> method was changed. This is only an issue for users who wrote their own losses for GBTs.</li>
<li><em>(Breaking change)</em> The <code class="language-plaintext highlighter-rouge">apply</code> and <code class="language-plaintext highlighter-rouge">copy</code> methods for the case class <a href="api/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.html"><code class="language-plaintext highlighter-rouge">BoostingStrategy</code></a> have been changed because of a modification to the case class fields. This could be an issue for users who use <code class="language-plaintext highlighter-rouge">BoostingStrategy</code> to set GBT parameters.</li>
</ul>
</li>
<li><em>(Breaking change)</em> The return value of <a href="api/scala/org/apache/spark/mllib/clustering/LDA.html"><code class="language-plaintext highlighter-rouge">LDA.run</code></a> has changed. It now returns an abstract class <code class="language-plaintext highlighter-rouge">LDAModel</code> instead of the concrete class <code class="language-plaintext highlighter-rouge">DistributedLDAModel</code>. The object of type <code class="language-plaintext highlighter-rouge">LDAModel</code> can still be cast to the appropriate concrete type, which depends on the optimization algorithm.</li>
</ul>
<p>In the <code class="language-plaintext highlighter-rouge">spark.ml</code> package, several major API changes occurred, including:</p>
<ul>
<li><code class="language-plaintext highlighter-rouge">Param</code> and other APIs for specifying parameters</li>
<li><code class="language-plaintext highlighter-rouge">uid</code> unique IDs for Pipeline components</li>
<li>Reorganization of certain classes</li>
</ul>
<p>Since the <code class="language-plaintext highlighter-rouge">spark.ml</code> API was an alpha component in Spark 1.3, we do not list all changes here.
However, since 1.4 <code class="language-plaintext highlighter-rouge">spark.ml</code> is no longer an alpha component, we will provide details on any API
changes for future releases.</p>
<h2 id="upgrading-from-mllib-12-to-13">Upgrading from MLlib 1.2 to 1.3</h2>
<p>In the <code class="language-plaintext highlighter-rouge">spark.mllib</code> package, there were several breaking changes. The first change (in <code class="language-plaintext highlighter-rouge">ALS</code>) is the only one in a component not marked as Alpha or Experimental.</p>
<ul>
<li><em>(Breaking change)</em> In <a href="api/scala/org/apache/spark/mllib/recommendation/ALS.html"><code class="language-plaintext highlighter-rouge">ALS</code></a>, the extraneous method <code class="language-plaintext highlighter-rouge">solveLeastSquares</code> has been removed. The <code class="language-plaintext highlighter-rouge">DeveloperApi</code> method <code class="language-plaintext highlighter-rouge">analyzeBlocks</code> was also removed.</li>
<li><em>(Breaking change)</em> <a href="api/scala/org/apache/spark/mllib/feature/StandardScalerModel.html"><code class="language-plaintext highlighter-rouge">StandardScalerModel</code></a> remains an Alpha component. In it, the <code class="language-plaintext highlighter-rouge">variance</code> method has been replaced with the <code class="language-plaintext highlighter-rouge">std</code> method. To compute the column variance values returned by the original <code class="language-plaintext highlighter-rouge">variance</code> method, simply square the standard deviation values returned by <code class="language-plaintext highlighter-rouge">std</code>.</li>
<li><em>(Breaking change)</em> <a href="api/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.html"><code class="language-plaintext highlighter-rouge">StreamingLinearRegressionWithSGD</code></a> remains an Experimental component. In it, there were two changes:
<ul>
<li>The constructor taking arguments was removed in favor of a builder pattern using the default constructor plus parameter setter methods.</li>
<li>Variable <code class="language-plaintext highlighter-rouge">model</code> is no longer public.</li>
</ul>
</li>
<li><em>(Breaking change)</em> <a href="api/scala/org/apache/spark/mllib/tree/DecisionTree.html"><code class="language-plaintext highlighter-rouge">DecisionTree</code></a> remains an Experimental component. In it and its associated classes, there were several changes:
<ul>
<li>In <code class="language-plaintext highlighter-rouge">DecisionTree</code>, the deprecated class method <code class="language-plaintext highlighter-rouge">train</code> has been removed. (The object/static <code class="language-plaintext highlighter-rouge">train</code> methods remain.)</li>
<li>In <code class="language-plaintext highlighter-rouge">Strategy</code>, the <code class="language-plaintext highlighter-rouge">checkpointDir</code> parameter has been removed. Checkpointing is still supported, but the checkpoint directory must be set before calling tree and tree ensemble training.</li>
</ul>
</li>
<li><code class="language-plaintext highlighter-rouge">PythonMLlibAPI</code> (the interface between Scala/Java and Python for MLlib) was a public API but is now private, declared <code class="language-plaintext highlighter-rouge">private[python]</code>. This was never meant for external use.</li>
<li>In linear regression (including Lasso and ridge regression), the squared loss is now divided by 2.
So in order to produce the same result as in 1.2, the regularization parameter needs to be divided by 2 and the step size needs to be multiplied by 2.</li>
</ul>
<p>In the <code class="language-plaintext highlighter-rouge">spark.ml</code> package, the main API changes are from Spark SQL. We list the most important changes here:</p>
<ul>
<li>The old <a href="https://spark.apache.org/docs/1.2.1/api/scala/index.html#org.apache.spark.sql.SchemaRDD">SchemaRDD</a> has been replaced with <a href="api/scala/org/apache/spark/sql/DataFrame.html">DataFrame</a> with a somewhat modified API. All algorithms in <code class="language-plaintext highlighter-rouge">spark.ml</code> which used to use SchemaRDD now use DataFrame.</li>
<li>In Spark 1.2, we used implicit conversions from <code class="language-plaintext highlighter-rouge">RDD</code>s of <code class="language-plaintext highlighter-rouge">LabeledPoint</code> into <code class="language-plaintext highlighter-rouge">SchemaRDD</code>s by calling <code class="language-plaintext highlighter-rouge">import sqlContext._</code> where <code class="language-plaintext highlighter-rouge">sqlContext</code> was an instance of <code class="language-plaintext highlighter-rouge">SQLContext</code>. These implicits have been moved, so we now call <code class="language-plaintext highlighter-rouge">import sqlContext.implicits._</code>.</li>
<li>Java APIs for SQL have also changed accordingly. Please see the examples above and the <a href="sql-programming-guide.html">Spark SQL Programming Guide</a> for details.</li>
</ul>
<p>Other changes were in <code class="language-plaintext highlighter-rouge">LogisticRegression</code>:</p>
<ul>
<li>The <code class="language-plaintext highlighter-rouge">scoreCol</code> output column (with default value &#8220;score&#8221;) was renamed to be <code class="language-plaintext highlighter-rouge">probabilityCol</code> (with default value &#8220;probability&#8221;). The type was originally <code class="language-plaintext highlighter-rouge">Double</code> (for the probability of class 1.0), but it is now <code class="language-plaintext highlighter-rouge">Vector</code> (for the probability of each class, to support multiclass classification in the future).</li>
<li>In Spark 1.2, <code class="language-plaintext highlighter-rouge">LogisticRegressionModel</code> did not include an intercept. In Spark 1.3, it includes an intercept; however, it will always be 0.0 since it uses the default settings for <a href="api/scala/org/apache/spark/mllib/classification/LogisticRegressionWithLBFGS.html">spark.mllib.LogisticRegressionWithLBFGS</a>. The option to use an intercept will be added in the future.</li>
</ul>
<h2 id="upgrading-from-mllib-11-to-12">Upgrading from MLlib 1.1 to 1.2</h2>
<p>The only API changes in MLlib v1.2 are in
<a href="api/scala/org/apache/spark/mllib/tree/DecisionTree.html"><code class="language-plaintext highlighter-rouge">DecisionTree</code></a>,
which continues to be an experimental API in MLlib 1.2:</p>
<ol>
<li>
<p><em>(Breaking change)</em> The Scala API for classification takes a named argument specifying the number
of classes. In MLlib v1.1, this argument was called <code class="language-plaintext highlighter-rouge">numClasses</code> in Python and
<code class="language-plaintext highlighter-rouge">numClassesForClassification</code> in Scala. In MLlib v1.2, the names are both set to <code class="language-plaintext highlighter-rouge">numClasses</code>.
This <code class="language-plaintext highlighter-rouge">numClasses</code> parameter is specified either via
<a href="api/scala/org/apache/spark/mllib/tree/configuration/Strategy.html"><code class="language-plaintext highlighter-rouge">Strategy</code></a>
or via <a href="api/scala/org/apache/spark/mllib/tree/DecisionTree.html"><code class="language-plaintext highlighter-rouge">DecisionTree</code></a>
static <code class="language-plaintext highlighter-rouge">trainClassifier</code> and <code class="language-plaintext highlighter-rouge">trainRegressor</code> methods.</p>
</li>
<li>
<p><em>(Breaking change)</em> The API for
<a href="api/scala/org/apache/spark/mllib/tree/model/Node.html"><code class="language-plaintext highlighter-rouge">Node</code></a> has changed.
This should generally not affect user code, unless the user manually constructs decision trees
(instead of using the <code class="language-plaintext highlighter-rouge">trainClassifier</code> or <code class="language-plaintext highlighter-rouge">trainRegressor</code> methods).
The tree <code class="language-plaintext highlighter-rouge">Node</code> now includes more information, including the probability of the predicted label
(for classification).</p>
</li>
<li>
<p>Printing methods&#8217; output has changed. The <code class="language-plaintext highlighter-rouge">toString</code> (Scala/Java) and <code class="language-plaintext highlighter-rouge">__repr__</code> (Python) methods used to print the full model; they now print a summary. For the full model, use <code class="language-plaintext highlighter-rouge">toDebugString</code>.</p>
</li>
</ol>
<p>Examples in the Spark distribution and examples in the
<a href="mllib-decision-tree.html#examples">Decision Trees Guide</a> have been updated accordingly.</p>
<h2 id="upgrading-from-mllib-10-to-11">Upgrading from MLlib 1.0 to 1.1</h2>
<p>The only API changes in MLlib v1.1 are in
<a href="api/scala/org/apache/spark/mllib/tree/DecisionTree.html"><code class="language-plaintext highlighter-rouge">DecisionTree</code></a>,
which continues to be an experimental API in MLlib 1.1:</p>
<ol>
<li>
<p><em>(Breaking change)</em> The meaning of tree depth has been changed by 1 in order to match
the implementations of trees in
<a href="http://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree">scikit-learn</a>
and in <a href="http://cran.r-project.org/web/packages/rpart/index.html">rpart</a>.
In MLlib v1.0, a depth-1 tree had 1 leaf node, and a depth-2 tree had 1 root node and 2 leaf nodes.
In MLlib v1.1, a depth-0 tree has 1 leaf node, and a depth-1 tree has 1 root node and 2 leaf nodes.
This depth is specified by the <code class="language-plaintext highlighter-rouge">maxDepth</code> parameter in
<a href="api/scala/org/apache/spark/mllib/tree/configuration/Strategy.html"><code class="language-plaintext highlighter-rouge">Strategy</code></a>
or via <a href="api/scala/org/apache/spark/mllib/tree/DecisionTree.html"><code class="language-plaintext highlighter-rouge">DecisionTree</code></a>
static <code class="language-plaintext highlighter-rouge">trainClassifier</code> and <code class="language-plaintext highlighter-rouge">trainRegressor</code> methods.</p>
</li>
<li>
<p><em>(Non-breaking change)</em> We recommend using the newly added <code class="language-plaintext highlighter-rouge">trainClassifier</code> and <code class="language-plaintext highlighter-rouge">trainRegressor</code>
methods to build a <a href="api/scala/org/apache/spark/mllib/tree/DecisionTree.html"><code class="language-plaintext highlighter-rouge">DecisionTree</code></a>,
rather than using the old parameter class <code class="language-plaintext highlighter-rouge">Strategy</code>. These new training methods explicitly
separate classification and regression, and they replace specialized parameter types with
simple <code class="language-plaintext highlighter-rouge">String</code> types.</p>
</li>
</ol>
<p>Examples of the new recommended <code class="language-plaintext highlighter-rouge">trainClassifier</code> and <code class="language-plaintext highlighter-rouge">trainRegressor</code> are given in the
<a href="mllib-decision-tree.html#examples">Decision Trees Guide</a>.</p>
<h2 id="upgrading-from-mllib-09-to-10">Upgrading from MLlib 0.9 to 1.0</h2>
<p>In MLlib v1.0, we support both dense and sparse input in a unified way, which introduces a few
breaking changes. If your data is sparse, please store it in a sparse format instead of dense to
take advantage of sparsity in both storage and computation. Details are described below.</p>
</div>
<!-- /container -->
</div>
<script src="js/vendor/jquery-3.5.1.min.js"></script>
<script src="js/vendor/bootstrap.bundle.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.js"></script>
<script>
  // Bind the Algolia DocSearch widget to this page's search box.
  //
  // DocSearch is free and automated, and is built in two parts:
  //   1. A crawler that runs on the DocSearch infrastructure every 24 hours. It follows
  //      every link on the website, extracts the content of each page it traverses, and
  //      pushes that content to an Algolia index.
  //   2. A JavaScript snippet (this one) that binds the Algolia index to the search input
  //      and displays the results in a dropdown UI.
  // For more details on how DocSearch works, see the DocSearch documentation.
  (function () {
    var searchOptions = {
      apiKey: 'd62f962a82bc9abb53471cb7b89da35e',
      appId: 'RAI69RXRSK',
      indexName: 'apache_spark',
      inputSelector: '#docsearch-input',
      enhancedSearchInput: true,
      // Restrict results to the documentation version this page belongs to.
      algoliaOptions: {
        facetFilters: ["version:3.3.4"]
      },
      // Set debug to true to keep the dropdown open for inspection.
      debug: false
    };

    docsearch(searchOptions);
  }());
</script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
  // MathJax startup configuration: have the TeX input processor number
  // equations automatically using the "AMS" scheme.
  MathJax.Hub.Config({
    TeX: {
      equationNumbers: {
        autoNumber: "AMS"
      }
    }
  });
</script>
<script>
  // Inject the MathJax script into <head> and, once it has loaded, configure its
  // tex2jax preprocessor.
  //
  // The URL is always absolute HTTPS. That works when this page is served over HTTP or
  // HTTPS and when it is opened as a local file (file://). A protocol-relative
  // "//cdnjs..." URL would not resolve under file://, and choosing "http://" for
  // non-HTTPS pages would fetch executable script over an unencrypted connection.
  (function(d) {
    var script = d.createElement('script');
    script.async = true;

    // Runs after MathJax.js has been fetched and executed, so the MathJax global exists.
    script.onload = function() {
      MathJax.Hub.Config({
        tex2jax: {
          // NOTE(review): the second inline pair evaluates to TWO backslashes followed
          // by a parenthesis, whereas displayMath below uses a single backslash before
          // the bracket. Left unchanged; confirm it matches how the pages are authored.
          inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
          displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
          processEscapes: true,
          skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
        }
      });
    };

    script.src = 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
      '?config=TeX-AMS-MML_HTMLorMML';
    d.getElementsByTagName('head')[0].appendChild(script);
  }(document));
</script>
</body>
</html>