<!-- blob: 84627c45fb318f709f05ed642395b1835c70ade5 [file] [log] [blame] -->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>
Spark Release 2.3.0 | Apache Spark
</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&family=Courier+Prime:wght@400;700&display=swap" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
<!-- Code highlighter CSS -->
<link href="/css/pygments-default.css" rel="stylesheet">
<link rel="icon" href="/favicon.ico" type="image/x-icon">
<!-- Matomo -->
<script>
// Matomo command queue; reuse the existing queue if one was already created.
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
// Asynchronously load matomo.js by inserting a script tag before the first existing one.
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="global">
<nav class="navbar navbar-expand-lg navbar-dark p-0 px-4" style="background: #1D6890;">
<a class="navbar-brand" href="/">
<img src="/images/spark-logo-rev.svg" alt="Apache Spark" width="141" height="72">
</a>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarContent"
aria-controls="navbarContent" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse col-md-12 col-lg-auto pt-4" id="navbarContent">
<ul class="navbar-nav me-auto">
<li class="nav-item">
<a class="nav-link" href="/downloads.html">Download</a>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="libraries" role="button" data-bs-toggle="dropdown"
aria-expanded="false">
Libraries
</a>
<ul class="dropdown-menu" aria-labelledby="libraries">
<li><a class="dropdown-item" href="/sql/">SQL and DataFrames</a></li>
<li><a class="dropdown-item" href="/spark-connect/">Spark Connect</a></li>
<li><a class="dropdown-item" href="/streaming/">Spark Streaming</a></li>
<li><a class="dropdown-item" href="/pandas-on-spark/">pandas on Spark</a></li>
<li><a class="dropdown-item" href="/mllib/">MLlib (machine learning)</a></li>
<li><a class="dropdown-item" href="/graphx/">GraphX (graph)</a></li>
<li>
<hr class="dropdown-divider">
</li>
<li><a class="dropdown-item" href="/third-party-projects.html">Third-Party Projects</a></li>
</ul>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="documentation" role="button" data-bs-toggle="dropdown"
aria-expanded="false">
Documentation
</a>
<ul class="dropdown-menu" aria-labelledby="documentation">
<li><a class="dropdown-item" href="/docs/latest/">Latest Release</a></li>
<li><a class="dropdown-item" href="/documentation.html">Older Versions and Other Resources</a></li>
<li><a class="dropdown-item" href="/faq.html">Frequently Asked Questions</a></li>
</ul>
</li>
<li class="nav-item">
<a class="nav-link" href="/examples.html">Examples</a>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="community" role="button" data-bs-toggle="dropdown"
aria-expanded="false">
Community
</a>
<ul class="dropdown-menu" aria-labelledby="community">
<li><a class="dropdown-item" href="/community.html">Mailing Lists &amp; Resources</a></li>
<li><a class="dropdown-item" href="/contributing.html">Contributing to Spark</a></li>
<li><a class="dropdown-item" href="/improvement-proposals.html">Improvement Proposals (SPIP)</a>
</li>
<li><a class="dropdown-item" href="https://issues.apache.org/jira/browse/SPARK">Issue Tracker</a>
</li>
<li><a class="dropdown-item" href="/powered-by.html">Powered By</a></li>
<li><a class="dropdown-item" href="/committers.html">Project Committers</a></li>
<li><a class="dropdown-item" href="/history.html">Project History</a></li>
</ul>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="developers" role="button" data-bs-toggle="dropdown"
aria-expanded="false">
Developers
</a>
<ul class="dropdown-menu" aria-labelledby="developers">
<li><a class="dropdown-item" href="/developer-tools.html">Useful Developer Tools</a></li>
<li><a class="dropdown-item" href="/versioning-policy.html">Versioning Policy</a></li>
<li><a class="dropdown-item" href="/release-process.html">Release Process</a></li>
<li><a class="dropdown-item" href="/security.html">Security</a></li>
</ul>
</li>
</ul>
<ul class="navbar-nav ms-auto">
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="apacheFoundation" role="button"
data-bs-toggle="dropdown" aria-expanded="false">
Apache Software Foundation
</a>
<ul class="dropdown-menu" aria-labelledby="apacheFoundation">
<li><a class="dropdown-item" href="https://www.apache.org/">Apache Homepage</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/licenses/">License</a></li>
<li><a class="dropdown-item"
href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/foundation/thanks.html">Thanks</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/security/">Security</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/events/current-event">Event</a></li>
</ul>
</li>
</ul>
</div>
</nav>
<div class="container">
<div class="row mt-4">
<div class="col-12 col-md-9">
<h2>Spark Release 2.3.0</h2>
<p>Apache Spark 2.3.0 is the fourth release in the 2.x line. This release adds support for Continuous Processing in Structured Streaming along with a brand new Kubernetes Scheduler backend. Other major updates include the new DataSource and Structured Streaming v2 APIs, and a number of PySpark performance enhancements. In addition, this release continues to focus on usability, stability, and polish while resolving around 1400 tickets.</p>
<p>To download Apache Spark 2.3.0, visit the <a href="/downloads.html">downloads</a> page. You can consult JIRA for the <a href="https://s.apache.org/spark-2.3.0">detailed changes</a>. We have curated a list of high level changes here, grouped by major modules.</p>
<ul id="markdown-toc">
<li><a href="#core-pyspark-and-spark-sql" id="markdown-toc-core-pyspark-and-spark-sql">Core, PySpark and Spark SQL</a></li>
<li><a href="#structured-streaming" id="markdown-toc-structured-streaming">Structured Streaming</a></li>
<li><a href="#mllib" id="markdown-toc-mllib">MLlib</a></li>
<li><a href="#sparkr" id="markdown-toc-sparkr">SparkR</a></li>
<li><a href="#graphx" id="markdown-toc-graphx">GraphX</a></li>
<li><a href="#deprecations" id="markdown-toc-deprecations">Deprecations</a></li>
<li><a href="#changes-of-behavior" id="markdown-toc-changes-of-behavior">Changes of behavior</a></li>
<li><a href="#known-issues" id="markdown-toc-known-issues">Known Issues</a></li>
<li><a href="#credits" id="markdown-toc-credits">Credits</a></li>
</ul>
<h3 id="core-pyspark-and-spark-sql">Core, PySpark and Spark SQL</h3>
<ul>
<li><strong>Major features</strong>
<ul>
<li><strong>Spark on Kubernetes</strong>: [<a href="https://issues.apache.org/jira/browse/SPARK-18278">SPARK-18278</a>] A new kubernetes scheduler backend that supports native submission of spark jobs to a cluster managed by kubernetes. Note that this support is currently experimental and behavioral changes around configurations, container images and entrypoints should be expected.</li>
<li><strong>Vectorized ORC Reader</strong>: [<a href="https://issues.apache.org/jira/browse/SPARK-16060">SPARK-16060</a>] Adds support for new ORC reader that substantially improves the ORC scan throughput through vectorization (2-5x). To enable the reader, users can set <code class="language-plaintext highlighter-rouge">spark.sql.orc.impl</code> to <code class="language-plaintext highlighter-rouge">native</code>.</li>
<li><strong>Spark History Server V2</strong>: [<a href="https://issues.apache.org/jira/browse/SPARK-18085">SPARK-18085</a>] A new spark history server (SHS) backend that provides better scalability for large scale applications with a more efficient event storage mechanism.</li>
<li><strong>Data source API V2</strong>: [<a href="https://issues.apache.org/jira/browse/SPARK-15689">SPARK-15689</a>][<a href="https://issues.apache.org/jira/browse/SPARK-22386">SPARK-22386</a>] An experimental API for plugging in new data sources in Spark. The new API attempts to address several limitations of the V1 API and aims to facilitate development of high performant, easy-to-maintain, and extensible external data sources. Note that this API is still undergoing active development and breaking changes should be expected.</li>
<li><strong>PySpark Performance Enhancements</strong>: [<a href="https://issues.apache.org/jira/browse/SPARK-22216">SPARK-22216</a>][<a href="https://issues.apache.org/jira/browse/SPARK-21187">SPARK-21187</a>] Significant improvements in python performance and interoperability by fast data serialization and vectorized execution.</li>
</ul>
</li>
<li><strong>Performance and stability</strong>
<ul>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-21975">SPARK-21975</a>] Histogram support in cost-based optimizer</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-20331">SPARK-20331</a>] Better support for predicate pushdown for Hive partition pruning</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-19112">SPARK-19112</a>] Support for ZStandard compression codec</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-21113">SPARK-21113</a>] Support for read ahead input stream to amortize disk I/O cost in the spill reader</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22510">SPARK-22510</a>][<a href="https://issues.apache.org/jira/browse/SPARK-22692">SPARK-22692</a>][<a href="https://issues.apache.org/jira/browse/SPARK-21871">SPARK-21871</a>] Further stabilize the codegen framework to avoid hitting the <code class="language-plaintext highlighter-rouge">64KB</code> JVM bytecode limit on the Java method and Java compiler constant pool limit</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-23207">SPARK-23207</a>] Fixed a long standing bug in Spark where consecutive shuffle+repartition on a DataFrame could lead to incorrect answers in certain surgical cases</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22062">SPARK-22062</a>][<a href="https://issues.apache.org/jira/browse/SPARK-17788">SPARK-17788</a>][<a href="https://issues.apache.org/jira/browse/SPARK-21907">SPARK-21907</a>] Fix various causes of OOMs</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22489">SPARK-22489</a>][<a href="https://issues.apache.org/jira/browse/SPARK-22916">SPARK-22916</a>][<a href="https://issues.apache.org/jira/browse/SPARK-22895">SPARK-22895</a>][<a href="https://issues.apache.org/jira/browse/SPARK-20758">SPARK-20758</a>][<a href="https://issues.apache.org/jira/browse/SPARK-22266">SPARK-22266</a>][<a href="https://issues.apache.org/jira/browse/SPARK-19122">SPARK-19122</a>][<a href="https://issues.apache.org/jira/browse/SPARK-22662">SPARK-22662</a>][<a href="https://issues.apache.org/jira/browse/SPARK-21652">SPARK-21652</a>] Enhancements in rule-based optimizer and planner</li>
</ul>
</li>
<li><strong>Other notable changes</strong>
<ul>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-20236">SPARK-20236</a>] Support Hive style dynamic partition overwrite semantics.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-4131">SPARK-4131</a>] Support <code class="language-plaintext highlighter-rouge">INSERT OVERWRITE DIRECTORY</code> to directly write data into the filesystem from a query</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-19285">SPARK-19285</a>][<a href="https://issues.apache.org/jira/browse/SPARK-22945">SPARK-22945</a>][<a href="https://issues.apache.org/jira/browse/SPARK-21499">SPARK-21499</a>][<a href="https://issues.apache.org/jira/browse/SPARK-20586">SPARK-20586</a>][<a href="https://issues.apache.org/jira/browse/SPARK-20416">SPARK-20416</a>][<a href="https://issues.apache.org/jira/browse/SPARK-20668">SPARK-20668</a>] UDF enhancements</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-20463">SPARK-20463</a>][<a href="https://issues.apache.org/jira/browse/SPARK-19951">SPARK-19951</a>][<a href="https://issues.apache.org/jira/browse/SPARK-22934">SPARK-22934</a>][<a href="https://issues.apache.org/jira/browse/SPARK-21055">SPARK-21055</a>][<a href="https://issues.apache.org/jira/browse/SPARK-17729">SPARK-17729</a>][<a href="https://issues.apache.org/jira/browse/SPARK-20962">SPARK-20962</a>][<a href="https://issues.apache.org/jira/browse/SPARK-20963">SPARK-20963</a>][<a href="https://issues.apache.org/jira/browse/SPARK-20841">SPARK-20841</a>][<a href="https://issues.apache.org/jira/browse/SPARK-17642">SPARK-17642</a>][<a href="https://issues.apache.org/jira/browse/SPARK-22475">SPARK-22475</a>][<a href="https://issues.apache.org/jira/browse/SPARK-22934">SPARK-22934</a>] Improved ANSI SQL compliance and Hive compatibility</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-20746">SPARK-20746</a>] More comprehensive SQL built-in functions</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-21485">SPARK-21485</a>] Spark SQL documentation generation for built-in functions</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-19810">SPARK-19810</a>] Remove support for Scala <code class="language-plaintext highlighter-rouge">2.10</code></li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22324">SPARK-22324</a>] Upgrade Arrow to <code class="language-plaintext highlighter-rouge">0.8.0</code> and Netty to <code class="language-plaintext highlighter-rouge">4.1.17</code></li>
</ul>
</li>
</ul>
<p><em>Programming guides: <a href="/docs/2.3.0/rdd-programming-guide.html">Spark RDD Programming Guide</a> and <a href="/docs/2.3.0/sql-programming-guide.html">Spark SQL, DataFrames and Datasets Guide</a>.</em></p>
<h3 id="structured-streaming">Structured Streaming</h3>
<ul>
<li><strong>Continuous Processing</strong>
<ul>
<li>A new execution engine that can execute streaming queries with sub-millisecond end-to-end latency by changing only a single line of user code. To learn more see the programming guide.</li>
</ul>
</li>
<li><strong>Stream-Stream Joins</strong>
<ul>
<li>Ability to join two streams of data, buffering rows until matching tuples arrive in the other stream. Predicates can be used against event time columns to bound the amount of state that needs to be retained.</li>
</ul>
</li>
<li><strong>Streaming API V2</strong>
<ul>
<li>An experimental API for plugging in new source and sinks that works for batch, micro-batch, and continuous execution. Note this API is still undergoing active development and breaking changes should be expected.</li>
</ul>
</li>
</ul>
<p><em>Programming guide: <a href="/docs/2.3.0/structured-streaming-programming-guide.html">Structured Streaming Programming Guide</a>.</em></p>
<h3 id="mllib">MLlib</h3>
<ul>
<li><strong>Highlights</strong>
<ul>
<li>ML Prediction now works with Structured Streaming, using updated APIs. Details below.</li>
</ul>
</li>
<li><strong>New/Improved APIs</strong>
<ul>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-21866">SPARK-21866</a>]: Built-in support for reading images into a DataFrame (Scala/Java/Python)</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-19634">SPARK-19634</a>]: DataFrame functions for descriptive summary statistics over vector columns (Scala/Java)</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-14516">SPARK-14516</a>]: <code class="language-plaintext highlighter-rouge">ClusteringEvaluator</code> for tuning clustering algorithms, supporting Cosine silhouette and squared Euclidean silhouette metrics (Scala/Java/Python)</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-3181">SPARK-3181</a>]: Robust linear regression with Huber loss (Scala/Java/Python)</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-13969">SPARK-13969</a>]: <code class="language-plaintext highlighter-rouge">FeatureHasher</code> transformer (Scala/Java/Python)</li>
<li>Multiple column support for several feature transformers:
<ul>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-13030">SPARK-13030</a>]: <code class="language-plaintext highlighter-rouge">OneHotEncoderEstimator</code> (Scala/Java/Python)</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22397">SPARK-22397</a>]: <code class="language-plaintext highlighter-rouge">QuantileDiscretizer</code> (Scala/Java)</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-20542">SPARK-20542</a>]: <code class="language-plaintext highlighter-rouge">Bucketizer</code> (Scala/Java/Python)</li>
</ul>
</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-21633">SPARK-21633</a>] and [<a href="https://issues.apache.org/jira/browse/SPARK-21542">SPARK-21542</a>]: Improved support for custom pipeline components in Python.</li>
</ul>
</li>
<li><strong>New Features</strong>
<ul>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-21087">SPARK-21087</a>]: <code class="language-plaintext highlighter-rouge">CrossValidator</code> and <code class="language-plaintext highlighter-rouge">TrainValidationSplit</code> can collect all models when fitting (Scala/Java). This allows you to inspect or save all fitted models.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-19357">SPARK-19357</a>]: Meta-algorithms <code class="language-plaintext highlighter-rouge">CrossValidator</code>, <code class="language-plaintext highlighter-rouge">TrainValidationSplit</code>, <code class="language-plaintext highlighter-rouge">OneVsRest</code> support a parallelism Param for fitting multiple sub-models in parallel Spark jobs</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-17139">SPARK-17139</a>]: Model summary for multinomial logistic regression (Scala/Java/Python)</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-18710">SPARK-18710</a>]: Add offset in GLM</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-20199">SPARK-20199</a>]: Added <code class="language-plaintext highlighter-rouge">featureSubsetStrategy</code> Param to <code class="language-plaintext highlighter-rouge">GBTClassifier</code> and <code class="language-plaintext highlighter-rouge">GBTRegressor</code>. Using this to subsample features can significantly improve training speed; this option has been a key strength of <code class="language-plaintext highlighter-rouge">xgboost</code>.</li>
</ul>
</li>
<li><strong>Other Notable Changes</strong>
<ul>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22156">SPARK-22156</a>] Fixed <code class="language-plaintext highlighter-rouge">Word2Vec</code> learning rate scaling with <code class="language-plaintext highlighter-rouge">num</code> iterations. The new learning rate is set to match the original <code class="language-plaintext highlighter-rouge">Word2Vec</code> C code and should give better results from training.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22289">SPARK-22289</a>] Add <code class="language-plaintext highlighter-rouge">JSON</code> support for Matrix parameters (This fixed a bug for ML persistence with <code class="language-plaintext highlighter-rouge">LogisticRegressionModel</code> when using bounds on coefficients.)</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22700">SPARK-22700</a>] <code class="language-plaintext highlighter-rouge">Bucketizer.transform</code> incorrectly drops row containing <code class="language-plaintext highlighter-rouge">NaN</code>. When Param <code class="language-plaintext highlighter-rouge">handleInvalid</code> was set to “skip,” <code class="language-plaintext highlighter-rouge">Bucketizer</code> would drop a row with a valid value in the input column if another (irrelevant) column had a <code class="language-plaintext highlighter-rouge">NaN</code> value.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22446">SPARK-22446</a>] Catalyst optimizer sometimes caused <code class="language-plaintext highlighter-rouge">StringIndexerModel</code> to throw an incorrect “Unseen label” exception when <code class="language-plaintext highlighter-rouge">handleInvalid</code> was set to “error.” This could happen for filtered data, due to predicate push-down, causing errors even after invalid rows had already been filtered from the input dataset.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-21681">SPARK-21681</a>] Fixed an edge case bug in multinomial logistic regression that resulted in incorrect coefficients when some features had zero variance.</li>
<li>Major optimizations:
<ul>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22707">SPARK-22707</a>] Reduced memory consumption for <code class="language-plaintext highlighter-rouge">CrossValidator</code></li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22949">SPARK-22949</a>] Reduced memory consumption for <code class="language-plaintext highlighter-rouge">TrainValidationSplit</code></li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-21690">SPARK-21690</a>] <code class="language-plaintext highlighter-rouge">Imputer</code> should train using a single pass over the data</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-14371">SPARK-14371</a>] <code class="language-plaintext highlighter-rouge">OnlineLDAOptimizer</code> avoids collecting statistics to the driver for each mini-batch.</li>
</ul>
</li>
</ul>
</li>
</ul>
<p><em>Programming guide: <a href="/docs/2.3.0/ml-guide.html">Machine Learning Library (MLlib) Guide</a>.</em></p>
<h3 id="sparkr">SparkR</h3>
<p>The main focus of SparkR in the 2.3.0 release was towards improving the stability of UDFs and adding several new SparkR wrappers around existing APIs:</p>
<ul>
<li><strong>Major features</strong>
<ul>
<li>Improved function parity between SQL and R</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22933">SPARK-22933</a>]: Structured Streaming APIs for <code class="language-plaintext highlighter-rouge">withWatermark</code>, <code class="language-plaintext highlighter-rouge">trigger</code>, <code class="language-plaintext highlighter-rouge">partitionBy</code> and stream-stream joins</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-21266">SPARK-21266</a>]: SparkR UDF with DDL-formatted schema support</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-20726">SPARK-20726</a>][<a href="https://issues.apache.org/jira/browse/SPARK-22924">SPARK-22924</a>][<a href="https://issues.apache.org/jira/browse/SPARK-22843">SPARK-22843</a>] Several new Dataframe API Wrappers</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-15767">SPARK-15767</a>][<a href="https://issues.apache.org/jira/browse/SPARK-21622">SPARK-21622</a>][<a href="https://issues.apache.org/jira/browse/SPARK-20917">SPARK-20917</a>][<a href="https://issues.apache.org/jira/browse/SPARK-20307">SPARK-20307</a>][<a href="https://issues.apache.org/jira/browse/SPARK-20906">SPARK-20906</a>] Several new SparkML API Wrappers</li>
</ul>
</li>
</ul>
<p><em>Programming guide: <a href="/docs/2.3.0/sparkr.html">SparkR (R on Spark)</a>.</em></p>
<h3 id="graphx">GraphX</h3>
<ul>
<li><strong>Optimizations</strong>
<ul>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-5484">SPARK-5484</a>] Pregel now checkpoints periodically to avoid <code class="language-plaintext highlighter-rouge">StackOverflowErrors</code></li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-21491">SPARK-21491</a>] Small performance improvement in several places</li>
</ul>
</li>
</ul>
<p><em>Programming guide: <a href="/docs/2.3.0/graphx-programming-guide.html">GraphX Programming Guide</a>.</em></p>
<h3 id="deprecations">Deprecations</h3>
<ul>
<li><strong>Python</strong>
<ul>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-23122">SPARK-23122</a>] Deprecate <code class="language-plaintext highlighter-rouge">register*</code> for UDFs in <code class="language-plaintext highlighter-rouge">SQLContext</code> and <code class="language-plaintext highlighter-rouge">Catalog</code> in PySpark</li>
</ul>
</li>
<li><strong>MLlib</strong>
<ul>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-13030">SPARK-13030</a>] <code class="language-plaintext highlighter-rouge">OneHotEncoder</code> has been deprecated and will be removed in 3.0. It has been replaced by the new <code class="language-plaintext highlighter-rouge">OneHotEncoderEstimator</code>. Note that <code class="language-plaintext highlighter-rouge">OneHotEncoderEstimator</code> will be renamed to <code class="language-plaintext highlighter-rouge">OneHotEncoder</code> in 3.0 (but <code class="language-plaintext highlighter-rouge">OneHotEncoderEstimator</code> will be kept as an alias).</li>
</ul>
</li>
</ul>
<h3 id="changes-of-behavior">Changes of behavior</h3>
<ul>
<li><strong>SparkSQL</strong>
<ul>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22036">SPARK-22036</a>]: By default arithmetic operations between decimals return a rounded value if an exact representation is not possible (instead of returning <code class="language-plaintext highlighter-rouge">NULL</code> in the prior versions)</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22937">SPARK-22937</a>]: When all inputs are binary, SQL <code class="language-plaintext highlighter-rouge">elt()</code> returns an output as binary. Otherwise, it returns as a string. In the prior versions, it always returns as a string regardless of input types.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22895">SPARK-22895</a>]: The Join/Filter&#8217;s deterministic predicates that are after the first non-deterministic predicates are also pushed down/through the child operators, if possible. In the prior versions, these filters were not eligible for predicate pushdown.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22771">SPARK-22771</a>]: When all inputs are binary, <code class="language-plaintext highlighter-rouge">functions.concat()</code> returns an output as binary. Otherwise, it returns as a string. In the prior versions, it always returns as a string regardless of input types.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22489">SPARK-22489</a>]: When either of the join sides is broadcastable, we prefer broadcasting the table that is explicitly specified in a broadcast hint.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22165">SPARK-22165</a>]: Partition column inference previously found incorrect common type for different inferred types, for example, previously it ended up with <code class="language-plaintext highlighter-rouge">double</code> type as the common type for <code class="language-plaintext highlighter-rouge">double</code> type and <code class="language-plaintext highlighter-rouge">date</code> type. Now it finds the correct common type for such conflicts. For details, see the <a href="/docs/2.3.0/sql-programming-guide.html#migration-guide">migration guide</a>.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22100">SPARK-22100</a>]: The <code class="language-plaintext highlighter-rouge">percentile_approx</code> function previously accepted <code class="language-plaintext highlighter-rouge">numeric</code> type input and outputted <code class="language-plaintext highlighter-rouge">double</code> type results. Now it supports <code class="language-plaintext highlighter-rouge">date</code> type, <code class="language-plaintext highlighter-rouge">timestamp</code> type and <code class="language-plaintext highlighter-rouge">numeric</code> types as input types. The result type is also changed to be the same as the input type, which is more reasonable for percentiles.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-21610">SPARK-21610</a>]: the queries from raw JSON/CSV files are disallowed when the referenced columns only include the internal corrupt record column (named <code class="language-plaintext highlighter-rouge">_corrupt_record</code> by default). Instead, you can cache or save the parsed results and then send the same query.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-23421">SPARK-23421</a>]: Since Spark 2.2.1 and 2.3.0, the schema is always inferred at runtime when the data source tables have the columns that exist in both partition schema and data schema. The inferred schema does not have the partitioned columns. When reading the table, Spark respects the partition values of these overlapping columns instead of the values stored in the data source files. In 2.2.0 and 2.1.x release, the inferred schema is partitioned but the data of the table is invisible to users (i.e., the result set is empty).</li>
</ul>
</li>
<li><strong>PySpark</strong>
<ul>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-19732">SPARK-19732</a>]: <code class="language-plaintext highlighter-rouge">na.fill()</code> or <code class="language-plaintext highlighter-rouge">fillna</code> also accepts boolean and replaces nulls with booleans. In prior Spark versions, PySpark just ignores it and returns the original Dataset/DataFrame.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22395">SPARK-22395</a>]: Pandas <code class="language-plaintext highlighter-rouge">0.19.2</code> or upper is required for using Pandas related functionalities, such as <code class="language-plaintext highlighter-rouge">toPandas</code>, <code class="language-plaintext highlighter-rouge">createDataFrame</code> from Pandas DataFrame, etc.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-22395">SPARK-22395</a>]: The behavior of timestamp values for Pandas related functionalities was changed to respect session timezone, which is ignored in the prior versions.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-23328">SPARK-23328</a>]: <code class="language-plaintext highlighter-rouge">df.replace</code> does not allow to omit <code class="language-plaintext highlighter-rouge">value</code> when <code class="language-plaintext highlighter-rouge">to_replace</code> is not a dictionary. Previously, <code class="language-plaintext highlighter-rouge">value</code> could be omitted in the other cases and had <code class="language-plaintext highlighter-rouge">None</code> by default, which is counter-intuitive and error prone.</li>
</ul>
</li>
<li><strong>MLlib</strong>
<ul>
<li><strong>Breaking API Changes</strong>: The class and trait hierarchy for logistic regression model summaries was changed to be cleaner and better accommodate the addition of the multi-class summary. This is a breaking change for user code that casts a <code class="language-plaintext highlighter-rouge">LogisticRegressionTrainingSummary</code> to a <code class="language-plaintext highlighter-rouge">BinaryLogisticRegressionTrainingSummary</code>. Users should instead use the <code class="language-plaintext highlighter-rouge">model.binarySummary</code> method. See [<a href="https://issues.apache.org/jira/browse/SPARK-17139">SPARK-17139</a>] for more detail (note this is an <code class="language-plaintext highlighter-rouge">@Experimental</code> API). This does not affect the Python summary method, which will still work correctly for both multinomial and binary cases.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-21806">SPARK-21806</a>]: <code class="language-plaintext highlighter-rouge">BinaryClassificationMetrics.pr()</code>: first point (0.0, 1.0) is misleading and has been replaced by (0.0, p) where precision p matches the lowest recall point.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-16957">SPARK-16957</a>]: Decision trees now use weighted midpoints when choosing split values. This may change results from model training.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-14657">SPARK-14657</a>]: <code class="language-plaintext highlighter-rouge">RFormula</code> without an intercept now outputs the reference category when encoding string terms, in order to match native R behavior. This may change results from model training.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-21027">SPARK-21027</a>]: The default parallelism used in <code class="language-plaintext highlighter-rouge">OneVsRest</code> is now set to 1 (i.e. serial). In 2.2 and earlier versions, the level of parallelism was set to the default threadpool size in Scala. This may change performance.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-21523">SPARK-21523</a>]: Upgraded Breeze to <code class="language-plaintext highlighter-rouge">0.13.2</code>. This included an important bug fix in strong Wolfe line search for L-BFGS.</li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-15526">SPARK-15526</a>]: The JPMML dependency is now shaded.</li>
<li>Also see the “Bug fixes” section for behavior changes resulting from fixing bugs.</li>
</ul>
</li>
</ul>
<h3 id="known-issues">Known Issues</h3>
<ul>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-23523">SPARK-23523</a>][SQL] Incorrect result caused by the rule <code class="language-plaintext highlighter-rouge">OptimizeMetadataOnlyQuery</code></li>
<li>[<a href="https://issues.apache.org/jira/browse/SPARK-23406">SPARK-23406</a>] Bugs in stream-stream self-joins</li>
</ul>
<h3 id="credits">Credits</h3>
<p>Last but not least, this release would not have been possible without the following contributors:
ALeksander Eskilson, Adrian Ionescu, Ajay Saini, Ala Luszczak, Albert Jang, Alberto Rodriguez De Lema, Alex Mikhailau, Alexander Istomin, Anderson Osagie, Andrea Zito, Andrew Ash, Andrew Korzhuev, Andrew Ray, Anirudh Ramanathan, Anton Okolnychyi, Arman Yazdani, Armin Braun, Arseniy Tashoyan, Arthur Rand, Atallah Hezbor, Attila Zsolt Piros, Ayush Singh, Bago Amirbekian, Ben Barnard, Bo Meng, Bo Xu, Bogdan Raducanu, Brad Kaiser, Bravo Zhang, Bruce Robbins, Bruce Xu, Bryan Cutler, Burak Yavuz, Carson Wang, Chang Chen, Charles Chen, Cheng Wang, Chenjun Zou, Chenzhao Guo, Chetan Khatri, Chie Hayashida, Chin Han Yu, Chunsheng Ji, Corey Woodfield, Daniel Li, Daniel Van Der Ende, Devaraj K, Dhruve Ashar, Dilip Biswal, Dmitry Parfenchik, Donghui Xu, Dongjoon Hyun, Eren Avsarogullari, Eric Vandenberg, Erik LaBianca, Eyal Farago, Favio Vazquez, Felix Cheung, Feng Liu, Feng Zhu, Fernando Pereira, Fokko Driesprong, Gabor Somogyi, Gene Pang, Gera Shegalov, German Schiavon, Glen Takahashi, Greg Owen, Grzegorz Slowikowski, Guilherme Berger, Guillaume Dardelet, Guo Xiao Long, He Qiao, Henry Robinson, Herman Van Hovell, Hideaki Tanaka, Holden Karau, Huang Tengfei, Huaxin Gao, Hyukjin Kwon, Ilya Matiach, Imran Rashid, Iurii Antykhovych, Ivan Sadikov, Jacek Laskowski, JackYangzg, Jakub Dubovsky, Jakub Nowacki, James Thompson, Jan Vrsovsky, Jane Wang, Jannik Arndt, Jason Taaffe, Jeff Zhang, Jen-Ming Chung, Jia Li, Jia-Xuan Liu, Jin Xing, Jinhua Fu, Jirka Kremser, Joachim Hereth, John Compitello, John Lee, John O&#8217;Leary, Jorge Machado, Jose Torres, Joseph K. 
Bradley, Josh Rosen, Juliusz Sompolski, Kalvin Chau, Kazuaki Ishizaki, Kent Yao, Kento NOZAWA, Kevin Yu, Kirby Linvill, Kohki Nishio, Kousuke Saruta, Kris Mok, Krishna Pandey, Kyle Kelley, Li Jin, Li Yichao, Li Yuanjian, Liang-Chi Hsieh, Lijia Liu, Liu Shaohui, Liu Xian, Liyun Zhang, Louis Lyu, Lubo Zhang, Luca Canali, Maciej Brynski, Maciej Szymkiewicz, Madhukara Phatak, Mahmut CAVDAR, Marcelo Vanzin, Marco Gaido, Marcos P, Marcos P. Sanchez, Mark Petruska, Maryann Xue, Masha Basmanova, Miao Wang, Michael Allman, Michael Armbrust, Michael Gummelt, Michael Mior, Michael Patterson, Michael Styles, Michal Senkyr, Mikhail Sveshnikov, Min Shen, Ming Jiang, Mingjie Tang, Mridul Muralidharan, Nan Zhu, Nathan Kronenfeld, Neil Alexander McQuarrie, Ngone51, Nicholas Chammas, Nick Pentreath, Ohad Raviv, Oleg Danilov, Onur Satici, PJ Fanning, Parth Gandhi, Patrick Woody, Paul Mackles, Peng Meng, Peng Xiao, Pengcheng Liu, Peter Szalai, Pralabh Kumar, Prashant Sharma, Rekha Joshi, Remis Haroon, Reynold Xin, Reza Safi, Riccardo Corbella, Rishabh Bhardwaj, Robert Kruszewski, Ron Hu, Ruben Berenguel Montoro, Ruben Janssen, Rui Zha, Rui Zhan, Ruifeng Zheng, Russell Spitzer, Ryan Blue, Sahil Takiar, Saisai Shao, Sameer Agarwal, Sandor Murakozi, Sanket Chintapalli, Santiago Saavedra, Sathiya Kumar, Sean Owen, Sergei Lebedev, Sergey Serebryakov, Sergey Zhemzhitsky, Seth Hendrickson, Shane Jarvie, Shashwat Anand, Shintaro Murakami, Shivaram Venkataraman, Shixiong Zhu, Shuangshuang Wang, Sid Murching, Sital Kedia, Soonmok Kwon, Srinivasa Reddy Vundela, Stavros Kontopoulos, Steve Loughran, Steven Rand, Sujith, Sujith Jay Nair, Sumedh Wale, Sunitha Kambhampati, Suresh Thalamati, Susan X. 
Huynh, Takeshi Yamamuro, Takuya UESHIN, Tathagata Das, Tejas Patil, Teng Peng, Thomas Graves, Tim Van Wassenhove, Travis Hegner, Tristan Stevens, Tucker Beck, Valeriy Avanesov, Vinitha Gankidi, Vinod KC, Wang Gengliang, Wayne Zhang, Weichen Xu, Wenchen Fan, Wieland Hoffmann, Wil Selwood, Wing Yew Poon, Xiang Gao, Xianjin YE, Xianyang Liu, Xiao Li, Xiaochen Ouyang, Xiaofeng Lin, Xiaokai Zhao, Xiayun Sun, Xin Lu, Xin Ren, Xingbo Jiang, Yan Facai (Yan Fa Cai), Yan Kit Li, Yanbo Liang, Yash Sharma, Yinan Li, Yong Tang, Youngbin Kim, Yuanjian Li, Yucai Yu, Yuhai Cen, Yuhao Yang, Yuming Wang, Yuval Itzchakov, Zhan Zhang, Zhang A Peng, Zhaokun Liu, Zheng RuiFeng, Zhenhua Wang, Zuo Tingbing, brandonJY, caneGuy, cxzl25, djvulee, eatoncys, heary-cao, ho3rexqj, lizhaoch, maclockard, neoremind, peay, shaofei007, wangjiaochun, zenglinxi0615</p>
<p>
<br>
<a href="/news/">Spark News Archive</a>
</p>
</div>
<div class="col-12 col-md-3">
<div class="news" style="margin-bottom: 20px;">
<h5>Latest News</h5>
<ul class="list-unstyled">
<li><a href="/news/spark-3-4-3-released.html">Spark 3.4.3 released</a>
<span class="small">(Apr 18, 2024)</span></li>
<li><a href="/news/spark-3-5-1-released.html">Spark 3.5.1 released</a>
<span class="small">(Feb 23, 2024)</span></li>
<li><a href="/news/spark-3-3-4-released.html">Spark 3.3.4 released</a>
<span class="small">(Dec 16, 2023)</span></li>
<li><a href="/news/spark-3-4-2-released.html">Spark 3.4.2 released</a>
<span class="small">(Nov 30, 2023)</span></li>
</ul>
<p class="small" style="text-align: right;"><a href="/news/index.html">Archive</a></p>
</div>
<div style="text-align:center; margin-bottom: 20px;">
<a href="https://www.apache.org/events/current-event.html">
<img src="https://www.apache.org/events/current-event-234x60.png" alt="Current Apache Software Foundation event" style="max-width: 100%;">
</a>
</div>
<div class="hidden-xs hidden-sm">
<a href="/downloads.html" class="btn btn-cta btn-lg d-grid" style="margin-bottom: 30px;">
Download Spark
</a>
<p style="font-size: 16px; font-weight: 500; color: #555;">
Built-in Libraries:
</p>
<ul class="list-none">
<li><a href="/sql/">SQL and DataFrames</a></li>
<li><a href="/streaming/">Spark Streaming</a></li>
<li><a href="/mllib/">MLlib (machine learning)</a></li>
<li><a href="/graphx/">GraphX (graph)</a></li>
</ul>
<a href="/third-party-projects.html">Third-Party Projects</a>
</div>
</div>
</div>
<footer class="small">
<hr>
Apache Spark, Spark, Apache, the Apache feather logo, and the Apache Spark project logo are either registered
trademarks or trademarks of The Apache Software Foundation in the United States and other countries.
See guidance on use of Apache Spark <a href="/trademarks.html">trademarks</a>.
All other marks mentioned may be trademarks or registered trademarks of their respective owners.
Copyright &copy; 2018 The Apache Software Foundation, Licensed under the
<a href="https://www.apache.org/licenses/">Apache License, Version 2.0</a>.
</footer>
</div>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/js/bootstrap.bundle.min.js"
integrity="sha384-MrcW6ZMFYlzcLA8Nl+NtUVF0sA7MsXsP1UyJoMp4YLEuNSfAP+JcXn/tWtIaxVXM"
crossorigin="anonymous"></script>
<script src="https://code.jquery.com/jquery.js"></script>
<script src="/js/lang-tabs.js"></script>
<script src="/js/downloads.js"></script>
</body>
</html>