| |
| <!DOCTYPE html> |
| <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> |
| <!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> |
| <!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> |
| <!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> |
| <title>Migration Guide: MLlib (Machine Learning) - Spark 3.3.4 Documentation</title> |
| |
| |
| |
| |
| <link rel="stylesheet" href="css/bootstrap.min.css"> |
| <style> |
| body { |
| padding-top: 60px; |
| padding-bottom: 40px; |
| } |
| </style> |
| <meta name="viewport" content="width=device-width"> |
| <link rel="stylesheet" href="css/main.css"> |
| |
| <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> |
| |
| <link rel="stylesheet" href="css/pygments-default.css"> |
| <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css" /> |
| <link rel="stylesheet" href="css/docsearch.css"> |
| |
| |
| <!-- Matomo -->
| <script>
| // Matomo web-analytics bootstrap: queues tracker commands, then loads the
| // tracker script asynchronously from analytics.apache.org.
| var _paq = window._paq = window._paq || [];
| /* tracker methods like "setCustomDimension" should be called before "trackPageView" */
| // Privacy: disable tracking cookies before the page view is recorded.
| _paq.push(["disableCookies"]);
| _paq.push(['trackPageView']);
| _paq.push(['enableLinkTracking']);
| (function() {
| var u="https://analytics.apache.org/";
| _paq.push(['setTrackerUrl', u+'matomo.php']);
| _paq.push(['setSiteId', '40']);
| // Insert the async matomo.js loader before the first <script> on the page,
| // so tracking never blocks document parsing.
| var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
| g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
| })();
| </script>
| <!-- End Matomo Code -->
| |
| |
| </head> |
| <body> |
| <!--[if lt IE 7]> |
| <p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="https://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> |
| <![endif]--> |
| |
| <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html --> |
| |
| <nav class="navbar fixed-top navbar-expand-md navbar-light bg-light" id="topbar"> |
| <div class="container"> |
| <div class="navbar-header"> |
| <div class="navbar-brand"><a href="index.html"> |
| <img src="img/spark-logo-hd.png" alt="Apache Spark" style="height:50px;"></a><span class="version">3.3.4</span> |
| </div> |
| </div> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" |
| data-target="#navbarCollapse" aria-controls="navbarCollapse" |
| aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| <div class="collapse navbar-collapse" id="navbarCollapse"> |
| <ul class="navbar-nav"> |
| <!--TODO(andyk): Add class="active" attribute to li some how.--> |
| <li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a> |
| <div class="dropdown-menu" aria-labelledby="navbarQuickStart"> |
| <a class="dropdown-item" href="quick-start.html">Quick Start</a> |
| <a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a> |
| <a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a> |
| <a class="dropdown-item" href="structured-streaming-programming-guide.html">Structured Streaming</a> |
| <a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a> |
| <a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a> |
| <a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a> |
| <a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a> |
| <a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a> |
| <div class="dropdown-menu" aria-labelledby="navbarAPIDocs"> |
| <a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a> |
| <a class="dropdown-item" href="api/java/index.html">Java</a> |
| <a class="dropdown-item" href="api/python/index.html">Python</a> |
| <a class="dropdown-item" href="api/R/index.html">R</a> |
| <a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a> |
| <div class="dropdown-menu" aria-labelledby="navbarDeploying"> |
| <a class="dropdown-item" href="cluster-overview.html">Overview</a> |
| <a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a> |
| <div class="dropdown-divider"></div> |
| <a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a> |
| <a class="dropdown-item" href="running-on-mesos.html">Mesos</a> |
| <a class="dropdown-item" href="running-on-yarn.html">YARN</a> |
| <a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a> |
| <div class="dropdown-menu" aria-labelledby="navbarMore"> |
| <a class="dropdown-item" href="configuration.html">Configuration</a> |
| <a class="dropdown-item" href="monitoring.html">Monitoring</a> |
| <a class="dropdown-item" href="tuning.html">Tuning Guide</a> |
| <a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a> |
| <a class="dropdown-item" href="security.html">Security</a> |
| <a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a> |
| <a class="dropdown-item" href="migration-guide.html">Migration Guide</a> |
| <div class="dropdown-divider"></div> |
| <a class="dropdown-item" href="building-spark.html">Building Spark</a> |
| <a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a> |
| <a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a> |
| </div> |
| </li> |
| |
| <li class="nav-item"> |
| <input type="text" id="docsearch-input" placeholder="Search the docs…"> |
| </li> |
| </ul> |
| <!--<span class="navbar-text navbar-right"><span class="version-text">v3.3.4</span></span>--> |
| </div> |
| </div> |
| </nav> |
| |
| <div class="container-wrapper"> |
| |
| |
| |
| <div class="left-menu-wrapper"> |
| <div class="left-menu"> |
| <h3><a href="ml-guide.html">MLlib: Main Guide</a></h3> |
| |
| <ul> |
| |
| <li> |
| <a href="ml-statistics.html"> |
| |
| Basic statistics |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-datasource.html"> |
| |
| Data sources |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-pipeline.html"> |
| |
| Pipelines |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-features.html"> |
| |
| Extracting, transforming and selecting features |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-classification-regression.html"> |
| |
| Classification and Regression |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-clustering.html"> |
| |
| Clustering |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-collaborative-filtering.html"> |
| |
| Collaborative filtering |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-frequent-pattern-mining.html"> |
| |
| Frequent Pattern Mining |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-tuning.html"> |
| |
| Model selection and tuning |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="ml-advanced.html"> |
| |
| Advanced topics |
| |
| </a> |
| </li> |
| |
| |
| |
| </ul> |
| |
| <h3><a href="mllib-guide.html">MLlib: RDD-based API Guide</a></h3> |
| |
| <ul> |
| |
| <li> |
| <a href="mllib-data-types.html"> |
| |
| Data types |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-statistics.html"> |
| |
| Basic statistics |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-classification-regression.html"> |
| |
| Classification and regression |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-collaborative-filtering.html"> |
| |
| Collaborative filtering |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-clustering.html"> |
| |
| Clustering |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-dimensionality-reduction.html"> |
| |
| Dimensionality reduction |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-feature-extraction.html"> |
| |
| Feature extraction and transformation |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-frequent-pattern-mining.html"> |
| |
| Frequent pattern mining |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-evaluation-metrics.html"> |
| |
| Evaluation metrics |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-pmml-model-export.html"> |
| |
| PMML model export |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="mllib-optimization.html"> |
| |
| Optimization (developer) |
| |
| </a> |
| </li> |
| |
| |
| |
| </ul> |
| |
| </div> |
| </div> |
| |
| <input id="nav-trigger" class="nav-trigger" checked type="checkbox"> |
| <label for="nav-trigger"></label> |
| <div class="content-with-sidebar mr-3" id="content"> |
| |
| <h1 class="title">Migration Guide: MLlib (Machine Learning)</h1> |
| |
| |
| <ul id="markdown-toc"> |
| <li><a href="#upgrading-from-mllib-24-to-30" id="markdown-toc-upgrading-from-mllib-24-to-30">Upgrading from MLlib 2.4 to 3.0</a></li> |
| <li><a href="#upgrading-from-mllib-22-to-23" id="markdown-toc-upgrading-from-mllib-22-to-23">Upgrading from MLlib 2.2 to 2.3</a></li> |
| <li><a href="#upgrading-from-mllib-21-to-22" id="markdown-toc-upgrading-from-mllib-21-to-22">Upgrading from MLlib 2.1 to 2.2</a></li> |
| <li><a href="#upgrading-from-mllib-20-to-21" id="markdown-toc-upgrading-from-mllib-20-to-21">Upgrading from MLlib 2.0 to 2.1</a></li> |
| <li><a href="#upgrading-from-mllib-16-to-20" id="markdown-toc-upgrading-from-mllib-16-to-20">Upgrading from MLlib 1.6 to 2.0</a></li> |
| <li><a href="#upgrading-from-mllib-15-to-16" id="markdown-toc-upgrading-from-mllib-15-to-16">Upgrading from MLlib 1.5 to 1.6</a></li> |
| <li><a href="#upgrading-from-mllib-14-to-15" id="markdown-toc-upgrading-from-mllib-14-to-15">Upgrading from MLlib 1.4 to 1.5</a></li> |
| <li><a href="#upgrading-from-mllib-13-to-14" id="markdown-toc-upgrading-from-mllib-13-to-14">Upgrading from MLlib 1.3 to 1.4</a></li> |
| <li><a href="#upgrading-from-mllib-12-to-13" id="markdown-toc-upgrading-from-mllib-12-to-13">Upgrading from MLlib 1.2 to 1.3</a></li> |
| <li><a href="#upgrading-from-mllib-11-to-12" id="markdown-toc-upgrading-from-mllib-11-to-12">Upgrading from MLlib 1.1 to 1.2</a></li> |
| <li><a href="#upgrading-from-mllib-10-to-11" id="markdown-toc-upgrading-from-mllib-10-to-11">Upgrading from MLlib 1.0 to 1.1</a></li> |
| <li><a href="#upgrading-from-mllib-09-to-10" id="markdown-toc-upgrading-from-mllib-09-to-10">Upgrading from MLlib 0.9 to 1.0</a></li> |
| </ul> |
| |
| <p>Note that this migration guide describes the items specific to MLlib. |
| Many items of SQL migration can be applied when migrating MLlib to higher versions for DataFrame-based APIs. |
| Please refer to the <a href="sql-migration-guide.html">Migration Guide: SQL, Datasets and DataFrame</a>.</p> |
| |
| <h2 id="upgrading-from-mllib-24-to-30">Upgrading from MLlib 2.4 to 3.0</h2> |
| |
| <h3 class="no_toc" id="breaking-changes">Breaking changes</h3> |
| |
| <ul> |
| <li><code class="language-plaintext highlighter-rouge">OneHotEncoder</code> which is deprecated in 2.3, is removed in 3.0 and <code class="language-plaintext highlighter-rouge">OneHotEncoderEstimator</code> is now renamed to <code class="language-plaintext highlighter-rouge">OneHotEncoder</code>.</li> |
| <li><code class="language-plaintext highlighter-rouge">org.apache.spark.ml.image.ImageSchema.readImages</code> which is deprecated in 2.3, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">spark.read.format('image')</code> instead.</li> |
| <li><code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.clustering.KMeans.train</code> with param Int <code class="language-plaintext highlighter-rouge">runs</code> which is deprecated in 2.1, is removed in 3.0. Use <code class="language-plaintext highlighter-rouge">train</code> method without <code class="language-plaintext highlighter-rouge">runs</code> instead.</li> |
| <li><code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.classification.LogisticRegressionWithSGD</code> which is deprecated in 2.0, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">org.apache.spark.ml.classification.LogisticRegression</code> or <code class="language-plaintext highlighter-rouge">spark.mllib.classification.LogisticRegressionWithLBFGS</code> instead.</li> |
| <li><code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.feature.ChiSqSelectorModel.isSorted</code> which is deprecated in 2.1, is removed in 3.0, is not intended for subclasses to use.</li> |
| <li><code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.regression.RidgeRegressionWithSGD</code> which is deprecated in 2.0, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">org.apache.spark.ml.regression.LinearRegression</code> with <code class="language-plaintext highlighter-rouge">elasticNetParam</code> = 0.0. Note the default <code class="language-plaintext highlighter-rouge">regParam</code> is 0.01 for <code class="language-plaintext highlighter-rouge">RidgeRegressionWithSGD</code>, but is 0.0 for <code class="language-plaintext highlighter-rouge">LinearRegression</code>.</li> |
| <li><code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.regression.LassoWithSGD</code> which is deprecated in 2.0, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">org.apache.spark.ml.regression.LinearRegression</code> with <code class="language-plaintext highlighter-rouge">elasticNetParam</code> = 1.0. Note the default <code class="language-plaintext highlighter-rouge">regParam</code> is 0.01 for <code class="language-plaintext highlighter-rouge">LassoWithSGD</code>, but is 0.0 for <code class="language-plaintext highlighter-rouge">LinearRegression</code>.</li> |
| <li><code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.regression.LinearRegressionWithSGD</code> which is deprecated in 2.0, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">org.apache.spark.ml.regression.LinearRegression</code> or <code class="language-plaintext highlighter-rouge">LBFGS</code> instead.</li> |
| <li><code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.clustering.KMeans.getRuns</code> and <code class="language-plaintext highlighter-rouge">setRuns</code> which are deprecated in 2.1, are removed in 3.0, have no effect since Spark 2.0.0.</li> |
| <li><code class="language-plaintext highlighter-rouge">org.apache.spark.ml.LinearSVCModel.setWeightCol</code> which is deprecated in 2.4, is removed in 3.0, is not intended for users.</li> |
| <li>From 3.0, <code class="language-plaintext highlighter-rouge">org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel</code> extends <code class="language-plaintext highlighter-rouge">MultilayerPerceptronParams</code> to expose the training params. As a result, <code class="language-plaintext highlighter-rouge">layers</code> in <code class="language-plaintext highlighter-rouge">MultilayerPerceptronClassificationModel</code> has been changed from <code class="language-plaintext highlighter-rouge">Array[Int]</code> to <code class="language-plaintext highlighter-rouge">IntArrayParam</code>. Users should use <code class="language-plaintext highlighter-rouge">MultilayerPerceptronClassificationModel.getLayers</code> instead of <code class="language-plaintext highlighter-rouge">MultilayerPerceptronClassificationModel.layers</code> to retrieve the size of layers.</li> |
| <li><code class="language-plaintext highlighter-rouge">org.apache.spark.ml.classification.GBTClassifier.numTrees</code> which is deprecated in 2.4.5, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">getNumTrees</code> instead.</li> |
| <li><code class="language-plaintext highlighter-rouge">org.apache.spark.ml.clustering.KMeansModel.computeCost</code> which is deprecated in 2.4, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">ClusteringEvaluator</code> instead.</li> |
| <li>The member variable <code class="language-plaintext highlighter-rouge">precision</code> in <code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.evaluation.MulticlassMetrics</code> which is deprecated in 2.0, is removed in 3.0. Use <code class="language-plaintext highlighter-rouge">accuracy</code> instead.</li> |
| <li>The member variable <code class="language-plaintext highlighter-rouge">recall</code> in <code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.evaluation.MulticlassMetrics</code> which is deprecated in 2.0, is removed in 3.0. Use <code class="language-plaintext highlighter-rouge">accuracy</code> instead.</li> |
| <li>The member variable <code class="language-plaintext highlighter-rouge">fMeasure</code> in <code class="language-plaintext highlighter-rouge">org.apache.spark.mllib.evaluation.MulticlassMetrics</code> which is deprecated in 2.0, is removed in 3.0. Use <code class="language-plaintext highlighter-rouge">accuracy</code> instead.</li> |
| <li><code class="language-plaintext highlighter-rouge">org.apache.spark.ml.util.GeneralMLWriter.context</code> which is deprecated in 2.0, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">session</code> instead.</li> |
| <li><code class="language-plaintext highlighter-rouge">org.apache.spark.ml.util.MLWriter.context</code> which is deprecated in 2.0, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">session</code> instead.</li> |
| <li><code class="language-plaintext highlighter-rouge">org.apache.spark.ml.util.MLReader.context</code> which is deprecated in 2.0, is removed in 3.0, use <code class="language-plaintext highlighter-rouge">session</code> instead.</li> |
| <li><code class="language-plaintext highlighter-rouge">abstract class UnaryTransformer[IN, OUT, T <: UnaryTransformer[IN, OUT, T]]</code> is changed to <code class="language-plaintext highlighter-rouge">abstract class UnaryTransformer[IN: TypeTag, OUT: TypeTag, T <: UnaryTransformer[IN, OUT, T]]</code> in 3.0.</li> |
| </ul> |
| |
| <h3 class="no_toc" id="deprecations-and-changes-of-behavior">Deprecations and changes of behavior</h3> |
| |
| <p><strong>Deprecations</strong></p> |
| |
| <ul> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-11215">SPARK-11215</a>: |
| <code class="language-plaintext highlighter-rouge">labels</code> in <code class="language-plaintext highlighter-rouge">StringIndexerModel</code> is deprecated and will be removed in 3.1.0. Use <code class="language-plaintext highlighter-rouge">labelsArray</code> instead.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-25758">SPARK-25758</a>: |
| <code class="language-plaintext highlighter-rouge">computeCost</code> in <code class="language-plaintext highlighter-rouge">BisectingKMeansModel</code> is deprecated and will be removed in future versions. Use <code class="language-plaintext highlighter-rouge">ClusteringEvaluator</code> instead.</li> |
| </ul> |
| |
| <p><strong>Changes of behavior</strong></p> |
| |
| <ul> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-11215">SPARK-11215</a>: |
| In Spark 2.4 and previous versions, when specifying <code class="language-plaintext highlighter-rouge">frequencyDesc</code> or <code class="language-plaintext highlighter-rouge">frequencyAsc</code> as |
| <code class="language-plaintext highlighter-rouge">stringOrderType</code> param in <code class="language-plaintext highlighter-rouge">StringIndexer</code>, in case of equal frequency, the order of |
| strings is undefined. Since Spark 3.0, the strings with equal frequency are further |
| sorted by alphabet. And since Spark 3.0, <code class="language-plaintext highlighter-rouge">StringIndexer</code> supports encoding multiple |
| columns.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-20604">SPARK-20604</a>: |
| In prior to 3.0 releases, <code class="language-plaintext highlighter-rouge">Imputer</code> requires input column to be Double or Float. In 3.0, this |
| restriction is lifted so <code class="language-plaintext highlighter-rouge">Imputer</code> can handle all numeric types.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-23469">SPARK-23469</a>: |
| In Spark 3.0, the <code class="language-plaintext highlighter-rouge">HashingTF</code> Transformer uses a corrected implementation of the murmur3 hash |
| function to hash elements to vectors. <code class="language-plaintext highlighter-rouge">HashingTF</code> in Spark 3.0 will map elements to |
| different positions in vectors than in Spark 2. However, <code class="language-plaintext highlighter-rouge">HashingTF</code> created with Spark 2.x |
| and loaded with Spark 3.0 will still use the previous hash function and will not change behavior.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-28969">SPARK-28969</a>: |
| The <code class="language-plaintext highlighter-rouge">setClassifier</code> method in PySpark’s <code class="language-plaintext highlighter-rouge">OneVsRestModel</code> has been removed in 3.0 for parity with |
| the Scala implementation. Callers should not need to set the classifier in the model after |
| creation.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-25790">SPARK-25790</a>: |
| PCA adds the support for more than 65535 column matrix in Spark 3.0.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-28927">SPARK-28927</a>: |
| When fitting ALS model on nondeterministic input data, previously if rerun happens, users |
| would see ArrayIndexOutOfBoundsException caused by mismatch between In/Out user/item blocks. |
| From 3.0, a SparkException with more clear message will be thrown, and original |
| ArrayIndexOutOfBoundsException is wrapped.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-29232">SPARK-29232</a>: |
| In prior to 3.0 releases, <code class="language-plaintext highlighter-rouge">RandomForestRegressionModel</code> doesn’t update the parameter maps |
| of the DecisionTreeRegressionModels underneath. This is fixed in 3.0.</li> |
| </ul> |
| |
| <h2 id="upgrading-from-mllib-22-to-23">Upgrading from MLlib 2.2 to 2.3</h2> |
| |
| <h3 class="no_toc" id="breaking-changes-1">Breaking changes</h3> |
| |
| <ul> |
| <li>The class and trait hierarchy for logistic regression model summaries was changed to be cleaner |
| and better accommodate the addition of the multi-class summary. This is a breaking change for user |
| code that casts a <code class="language-plaintext highlighter-rouge">LogisticRegressionTrainingSummary</code> to a |
| <code class="language-plaintext highlighter-rouge">BinaryLogisticRegressionTrainingSummary</code>. Users should instead use the <code class="language-plaintext highlighter-rouge">model.binarySummary</code> |
| method. See <a href="https://issues.apache.org/jira/browse/SPARK-17139">SPARK-17139</a> for more detail |
| (<em>note</em> this is an <code class="language-plaintext highlighter-rouge">Experimental</code> API). This <em>does not</em> affect the Python <code class="language-plaintext highlighter-rouge">summary</code> method, which |
| will still work correctly for both multinomial and binary cases.</li> |
| </ul> |
| |
| <h3 class="no_toc" id="deprecations-and-changes-of-behavior-1">Deprecations and changes of behavior</h3> |
| |
| <p><strong>Deprecations</strong></p> |
| |
| <ul> |
| <li><code class="language-plaintext highlighter-rouge">OneHotEncoder</code> has been deprecated and will be removed in <code class="language-plaintext highlighter-rouge">3.0</code>. It has been replaced by the |
| new <a href="ml-features.html#onehotencoderestimator"><code class="language-plaintext highlighter-rouge">OneHotEncoderEstimator</code></a> |
| (see <a href="https://issues.apache.org/jira/browse/SPARK-13030">SPARK-13030</a>). <strong>Note</strong> that |
| <code class="language-plaintext highlighter-rouge">OneHotEncoderEstimator</code> will be renamed to <code class="language-plaintext highlighter-rouge">OneHotEncoder</code> in <code class="language-plaintext highlighter-rouge">3.0</code> (but |
| <code class="language-plaintext highlighter-rouge">OneHotEncoderEstimator</code> will be kept as an alias).</li> |
| </ul> |
| |
| <p><strong>Changes of behavior</strong></p> |
| |
| <ul> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-21027">SPARK-21027</a>: |
| The default parallelism used in <code class="language-plaintext highlighter-rouge">OneVsRest</code> is now set to 1 (i.e. serial). In <code class="language-plaintext highlighter-rouge">2.2</code> and |
| earlier versions, the level of parallelism was set to the default threadpool size in Scala.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-22156">SPARK-22156</a>: |
| The learning rate update for <code class="language-plaintext highlighter-rouge">Word2Vec</code> was incorrect when <code class="language-plaintext highlighter-rouge">numIterations</code> was set greater than |
| <code class="language-plaintext highlighter-rouge">1</code>. This will cause training results to be different between <code class="language-plaintext highlighter-rouge">2.3</code> and earlier versions.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-21681">SPARK-21681</a>: |
| Fixed an edge case bug in multinomial logistic regression that resulted in incorrect coefficients |
| when some features had zero variance.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-16957">SPARK-16957</a>: |
| Tree algorithms now use mid-points for split values. This may change results from model training.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-14657">SPARK-14657</a>: |
| Fixed an issue where the features generated by <code class="language-plaintext highlighter-rouge">RFormula</code> without an intercept were inconsistent |
| with the output in R. This may change results from model training in this scenario.</li> |
| </ul> |
| |
| <h2 id="upgrading-from-mllib-21-to-22">Upgrading from MLlib 2.1 to 2.2</h2> |
| |
| <h3 class="no_toc" id="breaking-changes-2">Breaking changes</h3> |
| |
| <p>There are no breaking changes.</p> |
| |
| <h3 class="no_toc" id="deprecations-and-changes-of-behavior-2">Deprecations and changes of behavior</h3> |
| |
| <p><strong>Deprecations</strong></p> |
| |
| <p>There are no deprecations.</p> |
| |
| <p><strong>Changes of behavior</strong></p> |
| |
| <ul> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-19787">SPARK-19787</a>: |
| Default value of <code class="language-plaintext highlighter-rouge">regParam</code> changed from <code class="language-plaintext highlighter-rouge">1.0</code> to <code class="language-plaintext highlighter-rouge">0.1</code> for <code class="language-plaintext highlighter-rouge">ALS.train</code> method (marked <code class="language-plaintext highlighter-rouge">DeveloperApi</code>). |
| <strong>Note</strong> this does <em>not affect</em> the <code class="language-plaintext highlighter-rouge">ALS</code> Estimator or Model, nor MLlib’s <code class="language-plaintext highlighter-rouge">ALS</code> class.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-14772">SPARK-14772</a>: |
| Fixed inconsistency between Python and Scala APIs for <code class="language-plaintext highlighter-rouge">Param.copy</code> method.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-11569">SPARK-11569</a>: |
| <code class="language-plaintext highlighter-rouge">StringIndexer</code> now handles <code class="language-plaintext highlighter-rouge">NULL</code> values in the same way as unseen values. Previously an exception |
| would always be thrown regardless of the setting of the <code class="language-plaintext highlighter-rouge">handleInvalid</code> parameter.</li> |
| </ul> |
| |
| <h2 id="upgrading-from-mllib-20-to-21">Upgrading from MLlib 2.0 to 2.1</h2> |
| |
| <h3 class="no_toc" id="breaking-changes-3">Breaking changes</h3> |
| |
| <p><strong>Deprecated methods removed</strong></p> |
| |
| <ul> |
| <li><code class="language-plaintext highlighter-rouge">setLabelCol</code> in <code class="language-plaintext highlighter-rouge">feature.ChiSqSelectorModel</code></li> |
| <li><code class="language-plaintext highlighter-rouge">numTrees</code> in <code class="language-plaintext highlighter-rouge">classification.RandomForestClassificationModel</code> (This now refers to the Param called <code class="language-plaintext highlighter-rouge">numTrees</code>)</li> |
| <li><code class="language-plaintext highlighter-rouge">numTrees</code> in <code class="language-plaintext highlighter-rouge">regression.RandomForestRegressionModel</code> (This now refers to the Param called <code class="language-plaintext highlighter-rouge">numTrees</code>)</li> |
| <li><code class="language-plaintext highlighter-rouge">model</code> in <code class="language-plaintext highlighter-rouge">regression.LinearRegressionSummary</code></li> |
| <li><code class="language-plaintext highlighter-rouge">validateParams</code> in <code class="language-plaintext highlighter-rouge">PipelineStage</code></li> |
| <li><code class="language-plaintext highlighter-rouge">validateParams</code> in <code class="language-plaintext highlighter-rouge">Evaluator</code></li> |
| </ul> |
| |
| <h3 class="no_toc" id="deprecations-and-changes-of-behavior-3">Deprecations and changes of behavior</h3> |
| |
| <p><strong>Deprecations</strong></p> |
| |
| <ul> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-18592">SPARK-18592</a>: |
| Deprecate all Param setter methods except for input/output column Params for <code class="language-plaintext highlighter-rouge">DecisionTreeClassificationModel</code>, <code class="language-plaintext highlighter-rouge">GBTClassificationModel</code>, <code class="language-plaintext highlighter-rouge">RandomForestClassificationModel</code>, <code class="language-plaintext highlighter-rouge">DecisionTreeRegressionModel</code>, <code class="language-plaintext highlighter-rouge">GBTRegressionModel</code> and <code class="language-plaintext highlighter-rouge">RandomForestRegressionModel</code></li> |
| </ul> |
| |
| <p><strong>Changes of behavior</strong></p> |
| |
| <ul> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-17870">SPARK-17870</a>: |
| Fix a bug of <code class="language-plaintext highlighter-rouge">ChiSqSelector</code> which will likely change its result. Now <code class="language-plaintext highlighter-rouge">ChiSquareSelector</code> use pValue rather than raw statistic to select a fixed number of top features.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-3261">SPARK-3261</a>: |
| <code class="language-plaintext highlighter-rouge">KMeans</code> returns potentially fewer than k cluster centers in cases where k distinct centroids aren’t available or aren’t selected.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-17389">SPARK-17389</a>: |
| <code class="language-plaintext highlighter-rouge">KMeans</code> reduces the default number of steps from 5 to 2 for the k-means|| initialization mode.</li> |
| </ul> |
| |
| <h2 id="upgrading-from-mllib-16-to-20">Upgrading from MLlib 1.6 to 2.0</h2> |
| |
| <h3 class="no_toc" id="breaking-changes-4">Breaking changes</h3> |
| |
| <p>There were several breaking changes in Spark 2.0, which are outlined below.</p> |
| |
| <p><strong>Linear algebra classes for DataFrame-based APIs</strong></p> |
| |
| <p>Spark’s linear algebra dependencies were moved to a new project, <code class="language-plaintext highlighter-rouge">mllib-local</code> |
| (see <a href="https://issues.apache.org/jira/browse/SPARK-13944">SPARK-13944</a>). |
| As part of this change, the linear algebra classes were copied to a new package, <code class="language-plaintext highlighter-rouge">spark.ml.linalg</code>. |
| The DataFrame-based APIs in <code class="language-plaintext highlighter-rouge">spark.ml</code> now depend on the <code class="language-plaintext highlighter-rouge">spark.ml.linalg</code> classes, |
| leading to a few breaking changes, predominantly in various model classes |
| (see <a href="https://issues.apache.org/jira/browse/SPARK-14810">SPARK-14810</a> for a full list).</p> |
| |
| <p><strong>Note:</strong> the RDD-based APIs in <code class="language-plaintext highlighter-rouge">spark.mllib</code> continue to depend on the previous package <code class="language-plaintext highlighter-rouge">spark.mllib.linalg</code>.</p> |
| |
| <p><em>Converting vectors and matrices</em></p> |
| |
| <p>While most pipeline components support backward compatibility for loading, |
| some existing <code class="language-plaintext highlighter-rouge">DataFrames</code> and pipelines in Spark versions prior to 2.0, that contain vector or matrix |
| columns, may need to be migrated to the new <code class="language-plaintext highlighter-rouge">spark.ml</code> vector and matrix types. |
| Utilities for converting <code class="language-plaintext highlighter-rouge">DataFrame</code> columns from <code class="language-plaintext highlighter-rouge">spark.mllib.linalg</code> to <code class="language-plaintext highlighter-rouge">spark.ml.linalg</code> types |
| (and vice versa) can be found in <code class="language-plaintext highlighter-rouge">spark.mllib.util.MLUtils</code>.</p> |
| |
| <p>There are also utility methods available for converting single instances of |
| vectors and matrices. Use the <code class="language-plaintext highlighter-rouge">asML</code> method on a <code class="language-plaintext highlighter-rouge">mllib.linalg.Vector</code> / <code class="language-plaintext highlighter-rouge">mllib.linalg.Matrix</code> |
| for converting to <code class="language-plaintext highlighter-rouge">ml.linalg</code> types, and |
| <code class="language-plaintext highlighter-rouge">mllib.linalg.Vectors.fromML</code> / <code class="language-plaintext highlighter-rouge">mllib.linalg.Matrices.fromML</code> |
| for converting to <code class="language-plaintext highlighter-rouge">mllib.linalg</code> types.</p> |
| |
| <div class="codetabs"> |
| <div data-lang="scala"> |
| |
| <figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span> |
| |
| <span class="c1">// convert DataFrame columns</span> |
| <span class="k">val</span> <span class="nv">convertedVecDF</span> <span class="k">=</span> <span class="nv">MLUtils</span><span class="o">.</span><span class="py">convertVectorColumnsToML</span><span class="o">(</span><span class="n">vecDF</span><span class="o">)</span> |
| <span class="k">val</span> <span class="nv">convertedMatrixDF</span> <span class="k">=</span> <span class="nv">MLUtils</span><span class="o">.</span><span class="py">convertMatrixColumnsToML</span><span class="o">(</span><span class="n">matrixDF</span><span class="o">)</span> |
| <span class="c1">// convert a single vector or matrix</span> |
| <span class="k">val</span> <span class="nv">mlVec</span><span class="k">:</span> <span class="kt">org.apache.spark.ml.linalg.Vector</span> <span class="o">=</span> <span class="nv">mllibVec</span><span class="o">.</span><span class="py">asML</span> |
| <span class="k">val</span> <span class="nv">mlMat</span><span class="k">:</span> <span class="kt">org.apache.spark.ml.linalg.Matrix</span> <span class="o">=</span> <span class="nv">mllibMat</span><span class="o">.</span><span class="py">asML</span></code></pre></figure> |
| |
| <p>Refer to the <a href="api/scala/org/apache/spark/mllib/util/MLUtils$.html"><code class="language-plaintext highlighter-rouge">MLUtils</code> Scala docs</a> for further detail.</p> |
| </div> |
| |
| <div data-lang="java"> |
| |
| <figure class="highlight"><pre><code class="language-java" data-lang="java"><span class="kn">import</span> <span class="nn">org.apache.spark.mllib.util.MLUtils</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| |
| <span class="c1">// convert DataFrame columns</span> |
| <span class="nc">Dataset</span><span class="o"><</span><span class="nc">Row</span><span class="o">></span> <span class="n">convertedVecDF</span> <span class="o">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="na">convertVectorColumnsToML</span><span class="o">(</span><span class="n">vecDF</span><span class="o">);</span> |
| <span class="nc">Dataset</span><span class="o"><</span><span class="nc">Row</span><span class="o">></span> <span class="n">convertedMatrixDF</span> <span class="o">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="na">convertMatrixColumnsToML</span><span class="o">(</span><span class="n">matrixDF</span><span class="o">);</span> |
| <span class="c1">// convert a single vector or matrix</span> |
| <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">ml</span><span class="o">.</span><span class="na">linalg</span><span class="o">.</span><span class="na">Vector</span> <span class="n">mlVec</span> <span class="o">=</span> <span class="n">mllibVec</span><span class="o">.</span><span class="na">asML</span><span class="o">();</span> |
| <span class="n">org</span><span class="o">.</span><span class="na">apache</span><span class="o">.</span><span class="na">spark</span><span class="o">.</span><span class="na">ml</span><span class="o">.</span><span class="na">linalg</span><span class="o">.</span><span class="na">Matrix</span> <span class="n">mlMat</span> <span class="o">=</span> <span class="n">mllibMat</span><span class="o">.</span><span class="na">asML</span><span class="o">();</span></code></pre></figure> |
| |
| <p>Refer to the <a href="api/java/org/apache/spark/mllib/util/MLUtils.html"><code class="language-plaintext highlighter-rouge">MLUtils</code> Java docs</a> for further detail.</p> |
| </div> |
| |
| <div data-lang="python"> |
| |
| <figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">from</span> <span class="nn">pyspark.mllib.util</span> <span class="kn">import</span> <span class="n">MLUtils</span> |
| |
| <span class="c1"># convert DataFrame columns |
| </span><span class="n">convertedVecDF</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="p">.</span><span class="n">convertVectorColumnsToML</span><span class="p">(</span><span class="n">vecDF</span><span class="p">)</span> |
| <span class="n">convertedMatrixDF</span> <span class="o">=</span> <span class="n">MLUtils</span><span class="p">.</span><span class="n">convertMatrixColumnsToML</span><span class="p">(</span><span class="n">matrixDF</span><span class="p">)</span> |
| <span class="c1"># convert a single vector or matrix |
| </span><span class="n">mlVec</span> <span class="o">=</span> <span class="n">mllibVec</span><span class="p">.</span><span class="n">asML</span><span class="p">()</span> |
| <span class="n">mlMat</span> <span class="o">=</span> <span class="n">mllibMat</span><span class="p">.</span><span class="n">asML</span><span class="p">()</span></code></pre></figure> |
| |
| <p>Refer to the <a href="api/python/reference/api/pyspark.mllib.util.MLUtils.html#pyspark.mllib.util.MLUtils"><code class="language-plaintext highlighter-rouge">MLUtils</code> Python docs</a> for further detail.</p> |
| </div> |
| </div> |
| |
| <p><strong>Deprecated methods removed</strong></p> |
| |
| <p>Several deprecated methods were removed in the <code class="language-plaintext highlighter-rouge">spark.mllib</code> and <code class="language-plaintext highlighter-rouge">spark.ml</code> packages:</p> |
| |
| <ul> |
| <li><code class="language-plaintext highlighter-rouge">setScoreCol</code> in <code class="language-plaintext highlighter-rouge">ml.evaluation.BinaryClassificationEvaluator</code></li> |
| <li><code class="language-plaintext highlighter-rouge">weights</code> in <code class="language-plaintext highlighter-rouge">LinearRegression</code> and <code class="language-plaintext highlighter-rouge">LogisticRegression</code> in <code class="language-plaintext highlighter-rouge">spark.ml</code></li> |
| <li><code class="language-plaintext highlighter-rouge">setMaxNumIterations</code> in <code class="language-plaintext highlighter-rouge">mllib.optimization.LBFGS</code> (marked as <code class="language-plaintext highlighter-rouge">DeveloperApi</code>)</li> |
| <li><code class="language-plaintext highlighter-rouge">treeReduce</code> and <code class="language-plaintext highlighter-rouge">treeAggregate</code> in <code class="language-plaintext highlighter-rouge">mllib.rdd.RDDFunctions</code> (these functions are available on <code class="language-plaintext highlighter-rouge">RDD</code>s directly, and were marked as <code class="language-plaintext highlighter-rouge">DeveloperApi</code>)</li> |
| <li><code class="language-plaintext highlighter-rouge">defaultStrategy</code> in <code class="language-plaintext highlighter-rouge">mllib.tree.configuration.Strategy</code></li> |
| <li><code class="language-plaintext highlighter-rouge">build</code> in <code class="language-plaintext highlighter-rouge">mllib.tree.Node</code></li> |
| <li>libsvm loaders for multiclass and load/save labeledData methods in <code class="language-plaintext highlighter-rouge">mllib.util.MLUtils</code></li> |
| </ul> |
| |
| <p>A full list of breaking changes can be found at <a href="https://issues.apache.org/jira/browse/SPARK-14810">SPARK-14810</a>.</p> |
| |
| <h3 class="no_toc" id="deprecations-and-changes-of-behavior-4">Deprecations and changes of behavior</h3> |
| |
| <p><strong>Deprecations</strong></p> |
| |
| <p>Deprecations in the <code class="language-plaintext highlighter-rouge">spark.mllib</code> and <code class="language-plaintext highlighter-rouge">spark.ml</code> packages include:</p> |
| |
| <ul> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-14984">SPARK-14984</a>: |
| In <code class="language-plaintext highlighter-rouge">spark.ml.regression.LinearRegressionSummary</code>, the <code class="language-plaintext highlighter-rouge">model</code> field has been deprecated.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-13784">SPARK-13784</a>: |
| In <code class="language-plaintext highlighter-rouge">spark.ml.regression.RandomForestRegressionModel</code> and <code class="language-plaintext highlighter-rouge">spark.ml.classification.RandomForestClassificationModel</code>, |
| the <code class="language-plaintext highlighter-rouge">numTrees</code> parameter has been deprecated in favor of the <code class="language-plaintext highlighter-rouge">getNumTrees</code> method.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-13761">SPARK-13761</a>: |
| In <code class="language-plaintext highlighter-rouge">spark.ml.param.Params</code>, the <code class="language-plaintext highlighter-rouge">validateParams</code> method has been deprecated. |
| We move all functionality in overridden methods to the corresponding <code class="language-plaintext highlighter-rouge">transformSchema</code>.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-14829">SPARK-14829</a>: |
| In <code class="language-plaintext highlighter-rouge">spark.mllib</code> package, <code class="language-plaintext highlighter-rouge">LinearRegressionWithSGD</code>, <code class="language-plaintext highlighter-rouge">LassoWithSGD</code>, <code class="language-plaintext highlighter-rouge">RidgeRegressionWithSGD</code> and <code class="language-plaintext highlighter-rouge">LogisticRegressionWithSGD</code> have been deprecated. |
| We encourage users to use <code class="language-plaintext highlighter-rouge">spark.ml.regression.LinearRegression</code> and <code class="language-plaintext highlighter-rouge">spark.ml.classification.LogisticRegression</code>.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-14900">SPARK-14900</a>: |
| In <code class="language-plaintext highlighter-rouge">spark.mllib.evaluation.MulticlassMetrics</code>, the parameters <code class="language-plaintext highlighter-rouge">precision</code>, <code class="language-plaintext highlighter-rouge">recall</code> and <code class="language-plaintext highlighter-rouge">fMeasure</code> have been deprecated in favor of <code class="language-plaintext highlighter-rouge">accuracy</code>.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-15644">SPARK-15644</a>: |
| In <code class="language-plaintext highlighter-rouge">spark.ml.util.MLReader</code> and <code class="language-plaintext highlighter-rouge">spark.ml.util.MLWriter</code>, the <code class="language-plaintext highlighter-rouge">context</code> method has been deprecated in favor of <code class="language-plaintext highlighter-rouge">session</code>.</li> |
| <li>In <code class="language-plaintext highlighter-rouge">spark.ml.feature.ChiSqSelectorModel</code>, the <code class="language-plaintext highlighter-rouge">setLabelCol</code> method has been deprecated since it was not used by <code class="language-plaintext highlighter-rouge">ChiSqSelectorModel</code>.</li> |
| </ul> |
| |
| <p><strong>Changes of behavior</strong></p> |
| |
| <p>Changes of behavior in the <code class="language-plaintext highlighter-rouge">spark.mllib</code> and <code class="language-plaintext highlighter-rouge">spark.ml</code> packages include:</p> |
| |
| <ul> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-7780">SPARK-7780</a>: |
| <code class="language-plaintext highlighter-rouge">spark.mllib.classification.LogisticRegressionWithLBFGS</code> directly calls <code class="language-plaintext highlighter-rouge">spark.ml.classification.LogisticRegression</code> for binary classification now. |
| This will introduce the following behavior changes for <code class="language-plaintext highlighter-rouge">spark.mllib.classification.LogisticRegressionWithLBFGS</code>: |
| <ul> |
| <li>The intercept will not be regularized when training binary classification model with L1/L2 Updater.</li> |
| <li>If training is performed without regularization, then training with or without feature scaling will return the same solution, converging at the same rate.</li> |
| </ul> |
| </li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-13429">SPARK-13429</a>: |
| In order to provide better and consistent result with <code class="language-plaintext highlighter-rouge">spark.ml.classification.LogisticRegression</code>, |
| the default value of <code class="language-plaintext highlighter-rouge">spark.mllib.classification.LogisticRegressionWithLBFGS</code>: <code class="language-plaintext highlighter-rouge">convergenceTol</code> has been changed from 1E-4 to 1E-6.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-12363">SPARK-12363</a>: |
| Fix a bug of <code class="language-plaintext highlighter-rouge">PowerIterationClustering</code> which will likely change its result.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-13048">SPARK-13048</a>: |
| <code class="language-plaintext highlighter-rouge">LDA</code> using the <code class="language-plaintext highlighter-rouge">EM</code> optimizer will keep the last checkpoint by default, if checkpointing is being used.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-12153">SPARK-12153</a>: |
| <code class="language-plaintext highlighter-rouge">Word2Vec</code> now respects sentence boundaries. Previously, it did not handle them correctly.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-10574">SPARK-10574</a>: |
| <code class="language-plaintext highlighter-rouge">HashingTF</code> uses <code class="language-plaintext highlighter-rouge">MurmurHash3</code> as default hash algorithm in both <code class="language-plaintext highlighter-rouge">spark.ml</code> and <code class="language-plaintext highlighter-rouge">spark.mllib</code>.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-14768">SPARK-14768</a>: |
| The <code class="language-plaintext highlighter-rouge">expectedType</code> argument for PySpark <code class="language-plaintext highlighter-rouge">Param</code> was removed.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-14931">SPARK-14931</a>: |
| Some default <code class="language-plaintext highlighter-rouge">Param</code> values, which were mismatched between pipelines in Scala and Python, have been changed.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-13600">SPARK-13600</a>: |
| <code class="language-plaintext highlighter-rouge">QuantileDiscretizer</code> now uses <code class="language-plaintext highlighter-rouge">spark.sql.DataFrameStatFunctions.approxQuantile</code> to find splits (previously used custom sampling logic). |
| The output buckets will differ for the same input data and params.</li> |
| </ul> |
| |
| <h2 id="upgrading-from-mllib-15-to-16">Upgrading from MLlib 1.5 to 1.6</h2> |
| |
| <p>There are no breaking API changes in the <code class="language-plaintext highlighter-rouge">spark.mllib</code> or <code class="language-plaintext highlighter-rouge">spark.ml</code> packages, but there are |
| deprecations and changes of behavior.</p> |
| |
| <p>Deprecations:</p> |
| |
| <ul> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-11358">SPARK-11358</a>: |
| In <code class="language-plaintext highlighter-rouge">spark.mllib.clustering.KMeans</code>, the <code class="language-plaintext highlighter-rouge">runs</code> parameter has been deprecated.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-10592">SPARK-10592</a>: |
| In <code class="language-plaintext highlighter-rouge">spark.ml.classification.LogisticRegressionModel</code> and |
| <code class="language-plaintext highlighter-rouge">spark.ml.regression.LinearRegressionModel</code>, the <code class="language-plaintext highlighter-rouge">weights</code> field has been deprecated in favor of |
| the new name <code class="language-plaintext highlighter-rouge">coefficients</code>. This helps disambiguate from instance (row) “weights” given to |
| algorithms.</li> |
| </ul> |
| |
| <p>Changes of behavior:</p> |
| |
| <ul> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-7770">SPARK-7770</a>: |
| <code class="language-plaintext highlighter-rouge">spark.mllib.tree.GradientBoostedTrees</code>: <code class="language-plaintext highlighter-rouge">validationTol</code> has changed semantics in 1.6. |
| Previously, it was a threshold for absolute change in error. Now, it resembles the behavior of |
| <code class="language-plaintext highlighter-rouge">GradientDescent</code>’s <code class="language-plaintext highlighter-rouge">convergenceTol</code>: For large errors, it uses relative error (relative to the |
| previous error); for small errors (<code class="language-plaintext highlighter-rouge">< 0.01</code>), it uses absolute error.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-11069">SPARK-11069</a>: |
| <code class="language-plaintext highlighter-rouge">spark.ml.feature.RegexTokenizer</code>: Previously, it did not convert strings to lowercase before |
| tokenizing. Now, it converts to lowercase by default, with an option not to. This matches the |
| behavior of the simpler <code class="language-plaintext highlighter-rouge">Tokenizer</code> transformer.</li> |
| </ul> |
| |
| <h2 id="upgrading-from-mllib-14-to-15">Upgrading from MLlib 1.4 to 1.5</h2> |
| |
| <p>In the <code class="language-plaintext highlighter-rouge">spark.mllib</code> package, there are no breaking API changes but several behavior changes:</p> |
| |
| <ul> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-9005">SPARK-9005</a>: |
| <code class="language-plaintext highlighter-rouge">RegressionMetrics.explainedVariance</code> returns the average regression sum of squares.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-8600">SPARK-8600</a>: <code class="language-plaintext highlighter-rouge">NaiveBayesModel.labels</code> become |
| sorted.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-3382">SPARK-3382</a>: <code class="language-plaintext highlighter-rouge">GradientDescent</code> has a default |
| convergence tolerance <code class="language-plaintext highlighter-rouge">1e-3</code>, and hence iterations might end earlier than in 1.4.</li> |
| </ul> |
| |
| <p>In the <code class="language-plaintext highlighter-rouge">spark.ml</code> package, there exists one breaking API change and one behavior change:</p> |
| |
| <ul> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-9268">SPARK-9268</a>: Java’s varargs support is removed |
| from <code class="language-plaintext highlighter-rouge">Params.setDefault</code> due to a |
| <a href="https://issues.scala-lang.org/browse/SI-9013">Scala compiler bug</a>.</li> |
| <li><a href="https://issues.apache.org/jira/browse/SPARK-10097">SPARK-10097</a>: <code class="language-plaintext highlighter-rouge">Evaluator.isLargerBetter</code> is |
| added to indicate metric ordering. Metrics like RMSE no longer flip signs as in 1.4.</li> |
| </ul> |
| |
| <h2 id="upgrading-from-mllib-13-to-14">Upgrading from MLlib 1.3 to 1.4</h2> |
| |
| <p>In the <code class="language-plaintext highlighter-rouge">spark.mllib</code> package, there were several breaking changes, but all in <code class="language-plaintext highlighter-rouge">DeveloperApi</code> or <code class="language-plaintext highlighter-rouge">Experimental</code> APIs:</p> |
| |
| <ul> |
| <li>Gradient-Boosted Trees |
| <ul> |
| <li><em>(Breaking change)</em> The signature of the <a href="api/scala/org/apache/spark/mllib/tree/loss/Loss.html"><code class="language-plaintext highlighter-rouge">Loss.gradient</code></a> method was changed. This is only an issue for users who wrote their own losses for GBTs.</li> |
| <li><em>(Breaking change)</em> The <code class="language-plaintext highlighter-rouge">apply</code> and <code class="language-plaintext highlighter-rouge">copy</code> methods for the case class <a href="api/scala/org/apache/spark/mllib/tree/configuration/BoostingStrategy.html"><code class="language-plaintext highlighter-rouge">BoostingStrategy</code></a> have been changed because of a modification to the case class fields. This could be an issue for users who use <code class="language-plaintext highlighter-rouge">BoostingStrategy</code> to set GBT parameters.</li> |
| </ul> |
| </li> |
| <li><em>(Breaking change)</em> The return value of <a href="api/scala/org/apache/spark/mllib/clustering/LDA.html"><code class="language-plaintext highlighter-rouge">LDA.run</code></a> has changed. It now returns an abstract class <code class="language-plaintext highlighter-rouge">LDAModel</code> instead of the concrete class <code class="language-plaintext highlighter-rouge">DistributedLDAModel</code>. The object of type <code class="language-plaintext highlighter-rouge">LDAModel</code> can still be cast to the appropriate concrete type, which depends on the optimization algorithm.</li> |
| </ul> |
| |
| <p>In the <code class="language-plaintext highlighter-rouge">spark.ml</code> package, several major API changes occurred, including:</p> |
| |
| <ul> |
| <li><code class="language-plaintext highlighter-rouge">Param</code> and other APIs for specifying parameters</li> |
| <li><code class="language-plaintext highlighter-rouge">uid</code> unique IDs for Pipeline components</li> |
| <li>Reorganization of certain classes</li> |
| </ul> |
| |
| <p>Since the <code class="language-plaintext highlighter-rouge">spark.ml</code> API was an alpha component in Spark 1.3, we do not list all changes here. |
| However, since 1.4 <code class="language-plaintext highlighter-rouge">spark.ml</code> is no longer an alpha component, we will provide details on any API |
| changes for future releases.</p> |
| |
| <h2 id="upgrading-from-mllib-12-to-13">Upgrading from MLlib 1.2 to 1.3</h2> |
| |
| <p>In the <code class="language-plaintext highlighter-rouge">spark.mllib</code> package, there were several breaking changes. The first change (in <code class="language-plaintext highlighter-rouge">ALS</code>) is the only one in a component not marked as Alpha or Experimental.</p> |
| |
| <ul> |
| <li><em>(Breaking change)</em> In <a href="api/scala/org/apache/spark/mllib/recommendation/ALS.html"><code class="language-plaintext highlighter-rouge">ALS</code></a>, the extraneous method <code class="language-plaintext highlighter-rouge">solveLeastSquares</code> has been removed. The <code class="language-plaintext highlighter-rouge">DeveloperApi</code> method <code class="language-plaintext highlighter-rouge">analyzeBlocks</code> was also removed.</li> |
| <li><em>(Breaking change)</em> <a href="api/scala/org/apache/spark/mllib/feature/StandardScalerModel.html"><code class="language-plaintext highlighter-rouge">StandardScalerModel</code></a> remains an Alpha component. In it, the <code class="language-plaintext highlighter-rouge">variance</code> method has been replaced with the <code class="language-plaintext highlighter-rouge">std</code> method. To compute the column variance values returned by the original <code class="language-plaintext highlighter-rouge">variance</code> method, simply square the standard deviation values returned by <code class="language-plaintext highlighter-rouge">std</code>.</li> |
| <li><em>(Breaking change)</em> <a href="api/scala/org/apache/spark/mllib/regression/StreamingLinearRegressionWithSGD.html"><code class="language-plaintext highlighter-rouge">StreamingLinearRegressionWithSGD</code></a> remains an Experimental component. In it, there were two changes: |
| <ul> |
| <li>The constructor taking arguments was removed in favor of a builder pattern using the default constructor plus parameter setter methods.</li> |
| <li>Variable <code class="language-plaintext highlighter-rouge">model</code> is no longer public.</li> |
| </ul> |
| </li> |
| <li><em>(Breaking change)</em> <a href="api/scala/org/apache/spark/mllib/tree/DecisionTree.html"><code class="language-plaintext highlighter-rouge">DecisionTree</code></a> remains an Experimental component. In it and its associated classes, there were several changes: |
| <ul> |
| <li>In <code class="language-plaintext highlighter-rouge">DecisionTree</code>, the deprecated class method <code class="language-plaintext highlighter-rouge">train</code> has been removed. (The object/static <code class="language-plaintext highlighter-rouge">train</code> methods remain.)</li> |
| <li>In <code class="language-plaintext highlighter-rouge">Strategy</code>, the <code class="language-plaintext highlighter-rouge">checkpointDir</code> parameter has been removed. Checkpointing is still supported, but the checkpoint directory must be set before calling tree and tree ensemble training.</li> |
| </ul> |
| </li> |
| <li><code class="language-plaintext highlighter-rouge">PythonMLlibAPI</code> (the interface between Scala/Java and Python for MLlib) was a public API but is now private, declared <code class="language-plaintext highlighter-rouge">private[python]</code>. This was never meant for external use.</li> |
| <li>In linear regression (including Lasso and ridge regression), the squared loss is now divided by 2. |
| So in order to produce the same result as in 1.2, the regularization parameter needs to be divided by 2 and the step size needs to be multiplied by 2.</li> |
| </ul> |
| |
| <p>In the <code class="language-plaintext highlighter-rouge">spark.ml</code> package, the main API changes are from Spark SQL. We list the most important changes here:</p> |
| |
| <ul> |
| <li>The old <a href="https://spark.apache.org/docs/1.2.1/api/scala/index.html#org.apache.spark.sql.SchemaRDD">SchemaRDD</a> has been replaced with <a href="api/scala/org/apache/spark/sql/DataFrame.html">DataFrame</a> with a somewhat modified API. All algorithms in <code class="language-plaintext highlighter-rouge">spark.ml</code> which used to use SchemaRDD now use DataFrame.</li> |
| <li>In Spark 1.2, we used implicit conversions from <code class="language-plaintext highlighter-rouge">RDD</code>s of <code class="language-plaintext highlighter-rouge">LabeledPoint</code> into <code class="language-plaintext highlighter-rouge">SchemaRDD</code>s by calling <code class="language-plaintext highlighter-rouge">import sqlContext._</code> where <code class="language-plaintext highlighter-rouge">sqlContext</code> was an instance of <code class="language-plaintext highlighter-rouge">SQLContext</code>. These implicits have been moved, so we now call <code class="language-plaintext highlighter-rouge">import sqlContext.implicits._</code>.</li> |
| <li>Java APIs for SQL have also changed accordingly. Please see the examples above and the <a href="sql-programming-guide.html">Spark SQL Programming Guide</a> for details.</li> |
| </ul> |
| |
| <p>Other changes were in <code class="language-plaintext highlighter-rouge">LogisticRegression</code>:</p> |
| |
| <ul> |
| <li>The <code class="language-plaintext highlighter-rouge">scoreCol</code> output column (with default value “score”) was renamed to be <code class="language-plaintext highlighter-rouge">probabilityCol</code> (with default value “probability”). The type was originally <code class="language-plaintext highlighter-rouge">Double</code> (for the probability of class 1.0), but it is now <code class="language-plaintext highlighter-rouge">Vector</code> (for the probability of each class, to support multiclass classification in the future).</li> |
| <li>In Spark 1.2, <code class="language-plaintext highlighter-rouge">LogisticRegressionModel</code> did not include an intercept. In Spark 1.3, it includes an intercept; however, it will always be 0.0 since it uses the default settings for <a href="api/scala/org/apache/spark/mllib/classification/LogisticRegressionWithLBFGS.html">spark.mllib.LogisticRegressionWithLBFGS</a>. The option to use an intercept will be added in the future.</li> |
| </ul> |
| |
| <h2 id="upgrading-from-mllib-11-to-12">Upgrading from MLlib 1.1 to 1.2</h2> |
| |
| <p>The only API changes in MLlib v1.2 are in |
| <a href="api/scala/org/apache/spark/mllib/tree/DecisionTree.html"><code class="language-plaintext highlighter-rouge">DecisionTree</code></a>, |
| which continues to be an experimental API in MLlib 1.2:</p> |
| |
| <ol> |
| <li> |
| <p><em>(Breaking change)</em> The Scala API for classification takes a named argument specifying the number |
| of classes. In MLlib v1.1, this argument was called <code class="language-plaintext highlighter-rouge">numClasses</code> in Python and |
| <code class="language-plaintext highlighter-rouge">numClassesForClassification</code> in Scala. In MLlib v1.2, the names are both set to <code class="language-plaintext highlighter-rouge">numClasses</code>. |
| This <code class="language-plaintext highlighter-rouge">numClasses</code> parameter is specified either via |
| <a href="api/scala/org/apache/spark/mllib/tree/configuration/Strategy.html"><code class="language-plaintext highlighter-rouge">Strategy</code></a> |
| or via <a href="api/scala/org/apache/spark/mllib/tree/DecisionTree.html"><code class="language-plaintext highlighter-rouge">DecisionTree</code></a> |
| static <code class="language-plaintext highlighter-rouge">trainClassifier</code> and <code class="language-plaintext highlighter-rouge">trainRegressor</code> methods.</p> |
| </li> |
| <li> |
| <p><em>(Breaking change)</em> The API for |
| <a href="api/scala/org/apache/spark/mllib/tree/model/Node.html"><code class="language-plaintext highlighter-rouge">Node</code></a> has changed. |
| This should generally not affect user code, unless the user manually constructs decision trees |
| (instead of using the <code class="language-plaintext highlighter-rouge">trainClassifier</code> or <code class="language-plaintext highlighter-rouge">trainRegressor</code> methods). |
| The tree <code class="language-plaintext highlighter-rouge">Node</code> now includes more information, including the probability of the predicted label |
| (for classification).</p> |
| </li> |
| <li> |
| <p>Printing methods’ output has changed. The <code class="language-plaintext highlighter-rouge">toString</code> (Scala/Java) and <code class="language-plaintext highlighter-rouge">__repr__</code> (Python) methods used to print the full model; they now print a summary. For the full model, use <code class="language-plaintext highlighter-rouge">toDebugString</code>.</p> |
| </li> |
| </ol> |
| |
| <p>Examples in the Spark distribution and examples in the |
| <a href="mllib-decision-tree.html#examples">Decision Trees Guide</a> have been updated accordingly.</p> |
| |
| <h2 id="upgrading-from-mllib-10-to-11">Upgrading from MLlib 1.0 to 1.1</h2> |
| |
| <p>The only API changes in MLlib v1.1 are in |
| <a href="api/scala/org/apache/spark/mllib/tree/DecisionTree.html"><code class="language-plaintext highlighter-rouge">DecisionTree</code></a>, |
| which continues to be an experimental API in MLlib 1.1:</p> |
| |
| <ol> |
| <li> |
| <p><em>(Breaking change)</em> The meaning of tree depth has been changed by 1 in order to match |
| the implementations of trees in |
<a href="https://scikit-learn.org/stable/modules/classes.html#module-sklearn.tree">scikit-learn</a>
and in <a href="https://cran.r-project.org/web/packages/rpart/index.html">rpart</a>.
| In MLlib v1.0, a depth-1 tree had 1 leaf node, and a depth-2 tree had 1 root node and 2 leaf nodes. |
| In MLlib v1.1, a depth-0 tree has 1 leaf node, and a depth-1 tree has 1 root node and 2 leaf nodes. |
| This depth is specified by the <code class="language-plaintext highlighter-rouge">maxDepth</code> parameter in |
| <a href="api/scala/org/apache/spark/mllib/tree/configuration/Strategy.html"><code class="language-plaintext highlighter-rouge">Strategy</code></a> |
| or via <a href="api/scala/org/apache/spark/mllib/tree/DecisionTree.html"><code class="language-plaintext highlighter-rouge">DecisionTree</code></a> |
| static <code class="language-plaintext highlighter-rouge">trainClassifier</code> and <code class="language-plaintext highlighter-rouge">trainRegressor</code> methods.</p> |
| </li> |
| <li> |
| <p><em>(Non-breaking change)</em> We recommend using the newly added <code class="language-plaintext highlighter-rouge">trainClassifier</code> and <code class="language-plaintext highlighter-rouge">trainRegressor</code> |
| methods to build a <a href="api/scala/org/apache/spark/mllib/tree/DecisionTree.html"><code class="language-plaintext highlighter-rouge">DecisionTree</code></a>, |
| rather than using the old parameter class <code class="language-plaintext highlighter-rouge">Strategy</code>. These new training methods explicitly |
| separate classification and regression, and they replace specialized parameter types with |
| simple <code class="language-plaintext highlighter-rouge">String</code> types.</p> |
| </li> |
| </ol> |
| |
| <p>Examples of the new recommended <code class="language-plaintext highlighter-rouge">trainClassifier</code> and <code class="language-plaintext highlighter-rouge">trainRegressor</code> are given in the |
| <a href="mllib-decision-tree.html#examples">Decision Trees Guide</a>.</p> |
| |
| <h2 id="upgrading-from-mllib-09-to-10">Upgrading from MLlib 0.9 to 1.0</h2> |
| |
| <p>In MLlib v1.0, we support both dense and sparse input in a unified way, which introduces a few |
| breaking changes. If your data is sparse, please store it in a sparse format instead of dense to |
| take advantage of sparsity in both storage and computation. Details are described below.</p> |
| |
| |
| |
| </div> |
| |
| <!-- /container --> |
| </div> |
| |
| <script src="js/vendor/jquery-3.5.1.min.js"></script> |
| <script src="js/vendor/bootstrap.bundle.min.js"></script> |
| <script src="js/vendor/anchor.min.js"></script> |
| <script src="js/main.js"></script> |
| <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.js"></script> |
<script type="text/javascript">
    // Algolia DocSearch wiring for the docs search box. DocSearch is built in two parts:
    // 1. a crawler which Algolia runs on their own infrastructure every 24 hours. It follows every
    //    link in the website and extracts content from every page it traverses. It then pushes this
    //    content to an Algolia index.
    // 2. this JavaScript snippet, which binds that Algolia index to the search input and displays
    //    its results in a dropdown UI. For more details on how DocSearch works, see the
    //    DocSearch documentation.
    docsearch({
      apiKey: 'd62f962a82bc9abb53471cb7b89da35e',
      appId: 'RAI69RXRSK',
      indexName: 'apache_spark',
      inputSelector: '#docsearch-input',
      enhancedSearchInput: true,
      algoliaOptions: {
        // Restrict results to the docs version this page belongs to.
        'facetFilters': ["version:3.3.4"]
      },
      debug: false // Set debug to true if you want to inspect the dropdown
    });

</script>
| |
| <!-- MathJax Section --> |
<script type="text/x-mathjax-config">
  // Pre-load MathJax configuration (read by MathJax when it initializes):
  // enable AMS-style automatic equation numbering for display math.
  MathJax.Hub.Config({
    TeX: { equationNumbers: { autoNumber: "AMS" } }
  });
</script>
<script>
  // Dynamically inject the MathJax loader so the page renders math whether it is
  // served over http://, https://, or opened directly from disk (file://).
  // A protocol-relative "//cdn..." URL would break the file:// case, hence the
  // explicit scheme selection below.
  (function(doc) {
    var tag = doc.createElement('script');
    tag.type = 'text/javascript';
    tag.async = true;
    tag.onload = function() {
      // Configure delimiters once the library has finished loading.
      MathJax.Hub.Config({
        tex2jax: {
          inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
          displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
          processEscapes: true,
          skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
        }
      });
    };
    var scheme = ('https:' == doc.location.protocol ? 'https://' : 'http://');
    tag.src = scheme +
      'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
      '?config=TeX-AMS-MML_HTMLorMML';
    doc.getElementsByTagName('head')[0].appendChild(tag);
  })(document);
</script>
| </body> |
| </html> |