blob: 60d3946d4747af648098145363b9e6004d222afe [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>
Spark Release 3.3.0 | Apache Spark
</title>
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet"
integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&family=Courier+Prime:wght@400;700&display=swap" rel="stylesheet">
<link href="/css/custom.css" rel="stylesheet">
<!-- Code highlighter CSS -->
<link href="/css/pygments-default.css" rel="stylesheet">
<link rel="icon" href="/favicon.ico" type="image/x-icon">
<!-- Matomo -->
<script>
// Matomo analytics bootstrap.
// Reuse window._paq if it is already defined; otherwise start from an empty array.
// NOTE(review): the queued commands are presumably consumed by matomo.js once it
// loads — confirm against the Matomo tracking documentation.
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
// Each command is pushed as an array whose first element is the method name.
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
// The IIFE keeps the helper variables (u, d, g, s) out of the global scope.
(function() {
// Base URL shared by the tracker endpoint (matomo.php) and the tracker script (matomo.js).
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
// Create a <script> element for u + 'matomo.js', mark it async, and insert it
// before the first <script> element found in the document.
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="global">
<nav class="navbar navbar-expand-lg navbar-dark p-0 px-4" style="background: #1D6890;">
<a class="navbar-brand" href="/">
<img src="/images/spark-logo-rev.svg" alt="Apache Spark" width="141" height="72">
</a>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarContent"
aria-controls="navbarContent" aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse col-md-12 col-lg-auto pt-4" id="navbarContent">
<ul class="navbar-nav me-auto">
<li class="nav-item">
<a class="nav-link active" href="/downloads.html">Download</a>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="libraries" role="button" data-bs-toggle="dropdown"
aria-expanded="false">
Libraries
</a>
<ul class="dropdown-menu" aria-labelledby="libraries">
<li><a class="dropdown-item" href="/sql/">SQL and DataFrames</a></li>
<li><a class="dropdown-item" href="/spark-connect/">Spark Connect</a></li>
<li><a class="dropdown-item" href="/streaming/">Spark Streaming</a></li>
<li><a class="dropdown-item" href="/pandas-on-spark/">pandas on Spark</a></li>
<li><a class="dropdown-item" href="/mllib/">MLlib (machine learning)</a></li>
<li><a class="dropdown-item" href="/graphx/">GraphX (graph)</a></li>
<li>
<hr class="dropdown-divider">
</li>
<li><a class="dropdown-item" href="/third-party-projects.html">Third-Party Projects</a></li>
</ul>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="documentation" role="button" data-bs-toggle="dropdown"
aria-expanded="false">
Documentation
</a>
<ul class="dropdown-menu" aria-labelledby="documentation">
<li><a class="dropdown-item" href="/docs/latest/">Latest Release</a></li>
<li><a class="dropdown-item" href="/documentation.html">Older Versions and Other Resources</a></li>
<li><a class="dropdown-item" href="/faq.html">Frequently Asked Questions</a></li>
</ul>
</li>
<li class="nav-item">
<a class="nav-link active" href="/examples.html">Examples</a>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="community" role="button" data-bs-toggle="dropdown"
aria-expanded="false">
Community
</a>
<ul class="dropdown-menu" aria-labelledby="community">
<li><a class="dropdown-item" href="/community.html">Mailing Lists &amp; Resources</a></li>
<li><a class="dropdown-item" href="/contributing.html">Contributing to Spark</a></li>
<li><a class="dropdown-item" href="/improvement-proposals.html">Improvement Proposals (SPIP)</a>
</li>
<li><a class="dropdown-item" href="https://issues.apache.org/jira/browse/SPARK">Issue Tracker</a>
</li>
<li><a class="dropdown-item" href="/powered-by.html">Powered By</a></li>
<li><a class="dropdown-item" href="/committers.html">Project Committers</a></li>
<li><a class="dropdown-item" href="/history.html">Project History</a></li>
</ul>
</li>
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="developers" role="button" data-bs-toggle="dropdown"
aria-expanded="false">
Developers
</a>
<ul class="dropdown-menu" aria-labelledby="developers">
<li><a class="dropdown-item" href="/developer-tools.html">Useful Developer Tools</a></li>
<li><a class="dropdown-item" href="/versioning-policy.html">Versioning Policy</a></li>
<li><a class="dropdown-item" href="/release-process.html">Release Process</a></li>
<li><a class="dropdown-item" href="/security.html">Security</a></li>
</ul>
</li>
</ul>
<ul class="navbar-nav ml-auto">
<li class="nav-item dropdown">
<a class="nav-link dropdown-toggle" href="#" id="apacheFoundation" role="button"
data-bs-toggle="dropdown" aria-expanded="false">
Apache Software Foundation
</a>
<ul class="dropdown-menu" aria-labelledby="apacheFoundation">
<li><a class="dropdown-item" href="https://www.apache.org/">Apache Homepage</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/licenses/">License</a></li>
<li><a class="dropdown-item"
href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/foundation/thanks.html">Thanks</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/security/">Security</a></li>
<li><a class="dropdown-item" href="https://www.apache.org/events/current-event">Event</a></li>
</ul>
</li>
</ul>
</div>
</nav>
<div class="container">
<div class="row mt-4">
<div class="col-12 col-md-9">
<h2>Spark Release 3.3.0</h2>
<p>Apache Spark 3.3.0 is the fourth release of the 3.x line. With tremendous contribution from the open-source community, this release managed to resolve in excess of 1,600 Jira tickets.</p>
<p>This release improves join query performance via Bloom filters, increases the Pandas API coverage with the support of popular Pandas features such as datetime.timedelta and merge_asof, simplifies the migration from traditional data warehouses by improving ANSI compliance and supporting dozens of new built-in functions, and boosts development productivity with better error handling, autocompletion, performance, and profiling.</p>
<p>To download Apache Spark 3.3.0, visit the <a href="https://spark.apache.org/downloads.html">downloads</a> page. You can consult JIRA for the <a href="https://s.apache.org/spark-3.3.0">detailed changes</a>. We have curated a list of high-level changes here, grouped by major modules.</p>
<ul id="markdown-toc">
<li><a href="#highlight" id="markdown-toc-highlight">Highlight</a></li>
<li><a href="#spark-sql-and-core" id="markdown-toc-spark-sql-and-core">Spark SQL and Core</a> <ul>
<li><a href="#ansi-mode" id="markdown-toc-ansi-mode">ANSI mode</a></li>
<li><a href="#feature-enhancements" id="markdown-toc-feature-enhancements">Feature Enhancements</a></li>
<li><a href="#performance-enhancements" id="markdown-toc-performance-enhancements">Performance enhancements</a></li>
<li><a href="#built-in-connector-enhancements" id="markdown-toc-built-in-connector-enhancements">Built-in Connector Enhancements</a></li>
<li><a href="#data-source-v2-api" id="markdown-toc-data-source-v2-api">Data Source V2 API</a></li>
<li><a href="#kubernetes-enhancements" id="markdown-toc-kubernetes-enhancements">Kubernetes Enhancements</a></li>
<li><a href="#node-decommission" id="markdown-toc-node-decommission">Node Decommission</a></li>
<li><a href="#push-based-shuffle" id="markdown-toc-push-based-shuffle">Push-based shuffle</a></li>
<li><a href="#other-notable-changes" id="markdown-toc-other-notable-changes">Other Notable Changes</a></li>
</ul>
</li>
<li><a href="#structured-streaming" id="markdown-toc-structured-streaming">Structured Streaming</a> <ul>
<li><a href="#major-feature" id="markdown-toc-major-feature">Major feature</a></li>
<li><a href="#other-notable-changes-1" id="markdown-toc-other-notable-changes-1">Other Notable Changes</a></li>
</ul>
</li>
<li><a href="#pyspark" id="markdown-toc-pyspark">PySpark</a> <ul>
<li><a href="#pandas-api-on-spark" id="markdown-toc-pandas-api-on-spark">Pandas API on Spark</a></li>
<li><a href="#other-notable-changes-2" id="markdown-toc-other-notable-changes-2">Other Notable Changes</a></li>
</ul>
</li>
<li><a href="#mllib" id="markdown-toc-mllib">MLLIB</a></li>
<li><a href="#sparkr" id="markdown-toc-sparkr">SparkR</a></li>
<li><a href="#ui" id="markdown-toc-ui">UI</a></li>
<li><a href="#build" id="markdown-toc-build">Build</a></li>
<li><a href="#credits" id="markdown-toc-credits">Credits</a></li>
</ul>
<h3 id="highlight">Highlight</h3>
<ul>
<li>Row-level Runtime Filtering (<a href="https://issues.apache.org/jira/browse/SPARK-32268">SPARK-32268</a>)</li>
<li>ANSI enhancements (<a href="https://issues.apache.org/jira/browse/SPARK-38860">SPARK-38860</a>)</li>
<li>Error Message Improvements (<a href="https://issues.apache.org/jira/browse/SPARK-38781">SPARK-38781</a>)</li>
<li>Support complex types for Parquet vectorized reader (<a href="https://issues.apache.org/jira/browse/SPARK-34863">SPARK-34863</a>)</li>
<li>Hidden File Metadata Support for Spark SQL (<a href="https://issues.apache.org/jira/browse/SPARK-37273">SPARK-37273</a>)</li>
<li>Provide a profiler for Python/Pandas UDFs (<a href="https://issues.apache.org/jira/browse/SPARK-37443">SPARK-37443</a>)</li>
<li>Introduce Trigger.AvailableNow for running streaming queries like Trigger.Once in multiple batches (<a href="https://issues.apache.org/jira/browse/SPARK-36533">SPARK-36533</a>)</li>
<li>More comprehensive DS V2 push down capabilities (<a href="https://issues.apache.org/jira/browse/SPARK-38788">SPARK-38788</a>)</li>
<li>Executor Rolling in Kubernetes environment (<a href="https://issues.apache.org/jira/browse/SPARK-37810">SPARK-37810</a>)</li>
<li>Support Customized Kubernetes Schedulers (<a href="https://issues.apache.org/jira/browse/SPARK-36057">SPARK-36057</a>)</li>
<li>Migrating from log4j 1 to log4j 2 (<a href="https://issues.apache.org/jira/browse/SPARK-37814">SPARK-37814</a>)</li>
</ul>
<h3 id="spark-sql-and-core">Spark SQL and Core</h3>
<h4 id="ansi-mode">ANSI mode</h4>
<ul>
<li>New explicit cast syntax rules in ANSI mode (<a href="https://issues.apache.org/jira/browse/SPARK-33354">SPARK-33354</a>)</li>
<li>Elt() should return null if index is null under ANSI mode (<a href="https://issues.apache.org/jira/browse/SPARK-38304">SPARK-38304</a>)</li>
<li>Optionally return null result if element not exists in array/map (<a href="https://issues.apache.org/jira/browse/SPARK-37750">SPARK-37750</a>)</li>
<li>Allow casting between numeric type and timestamp type (<a href="https://issues.apache.org/jira/browse/SPARK-37714">SPARK-37714</a>)</li>
<li>Disable ANSI reserved keywords by default (<a href="https://issues.apache.org/jira/browse/SPARK-37724">SPARK-37724</a>)</li>
<li>Use store assignment rules for resolving function invocation (<a href="https://issues.apache.org/jira/browse/SPARK-37438">SPARK-37438</a>)</li>
<li>Add a config to allow casting between Datetime and Numeric (<a href="https://issues.apache.org/jira/browse/SPARK-37179">SPARK-37179</a>)</li>
<li>Add a config to optionally enforce ANSI reserved keywords (<a href="https://issues.apache.org/jira/browse/SPARK-37133">SPARK-37133</a>)</li>
<li>Disallow binary operations between Interval and String literal (<a href="https://issues.apache.org/jira/browse/SPARK-36508">SPARK-36508</a>)</li>
</ul>
<h4 id="feature-enhancements">Feature Enhancements</h4>
<ul>
<li>Support ANSI SQL INTERVAL types (<a href="https://issues.apache.org/jira/browse/SPARK-27790">SPARK-27790</a>)</li>
<li>Error Message Improvements (<a href="https://issues.apache.org/jira/browse/SPARK-38781">SPARK-38781</a>)</li>
<li>Hidden File Metadata Support for Spark SQL (<a href="https://issues.apache.org/jira/browse/SPARK-37273">SPARK-37273</a>)</li>
<li>Support raw string literal (<a href="https://issues.apache.org/jira/browse/SPARK-36371">SPARK-36371</a>)</li>
<li>Helper class for batch Dataset.observe() (<a href="https://issues.apache.org/jira/browse/SPARK-34806">SPARK-34806</a>)</li>
<li>Support specify initial partition number for rebalance (<a href="https://issues.apache.org/jira/browse/SPARK-38410">SPARK-38410</a>)</li>
<li>Support cascade mode for <code class="language-plaintext highlighter-rouge">dropNamespace</code> API (<a href="https://issues.apache.org/jira/browse/SPARK-37929">SPARK-37929</a>)</li>
<li>Allow store assignment and implicit cast among datetime types (<a href="https://issues.apache.org/jira/browse/SPARK-37707">SPARK-37707</a>)</li>
<li>Collect, first and last should be deterministic aggregate functions (<a href="https://issues.apache.org/jira/browse/SPARK-32940">SPARK-32940</a>)</li>
<li>Add array support to union by name (<a href="https://issues.apache.org/jira/browse/SPARK-36546">SPARK-36546</a>)</li>
<li>Add df.withMetadata: a syntax sugar to update the metadata of a dataframe (<a href="https://issues.apache.org/jira/browse/SPARK-36642">SPARK-36642</a>)</li>
<li>Use CAST in parsing of dates/timestamps with default pattern (<a href="https://issues.apache.org/jira/browse/SPARK-36418">SPARK-36418</a>)</li>
<li>Support value class in nested schema for Dataset (<a href="https://issues.apache.org/jira/browse/SPARK-20384">SPARK-20384</a>)</li>
<li>Add AS OF syntax support (<a href="https://issues.apache.org/jira/browse/SPARK-37219">SPARK-37219</a>)</li>
<li>Add REPEATABLE in TABLESAMPLE to specify seed (<a href="https://issues.apache.org/jira/browse/SPARK-37165">SPARK-37165</a>)</li>
<li>Add ansi syntax <code class="language-plaintext highlighter-rouge">set catalog xxx</code> to change the current catalog (<a href="https://issues.apache.org/jira/browse/SPARK-36841">SPARK-36841</a>)</li>
<li>Support ILIKE (ALL | ANY | SOME) - case insensitive LIKE (<a href="https://issues.apache.org/jira/browse/SPARK-36674">SPARK-36674</a>, <a href="https://issues.apache.org/jira/browse/SPARK-36736">SPARK-36736</a>, <a href="https://issues.apache.org/jira/browse/SPARK-36778">SPARK-36778</a>)</li>
<li>Support query stage show runtime statistics in formatted explain mode (<a href="https://issues.apache.org/jira/browse/SPARK-38322">SPARK-38322</a>)</li>
<li>Add spill size metrics for sort merge join (<a href="https://issues.apache.org/jira/browse/SPARK-37726">SPARK-37726</a>)</li>
<li>Update the SQL syntax of SHOW FUNCTIONS (<a href="https://issues.apache.org/jira/browse/SPARK-37777">SPARK-37777</a>)</li>
<li>Storage Partitioned Join (<a href="https://issues.apache.org/jira/browse/SPARK-37375">SPARK-37375</a>)</li>
<li>Support DROP COLUMN [IF EXISTS] syntax (<a href="https://issues.apache.org/jira/browse/SPARK-38939">SPARK-38939</a>)</li>
<li>New built-in functions and their extensions (<a href="https://issues.apache.org/jira/browse/SPARK-38783">SPARK-38783</a>)
<ul>
<li>Datetime
<ul>
<li>Add the <span style="text-decoration:underline;">TIMESTAMPADD</span>() function (<a href="https://issues.apache.org/jira/browse/SPARK-38195">SPARK-38195</a>)</li>
<li>Add the <span style="text-decoration:underline;">TIMESTAMPDIFF</span>() function (<a href="https://issues.apache.org/jira/browse/SPARK-38284">SPARK-38284</a>)</li>
<li>Add the <span style="text-decoration:underline;">DATEDIFF</span>() alias for <code class="language-plaintext highlighter-rouge">TIMESTAMPDIFF()</code> (<a href="https://issues.apache.org/jira/browse/SPARK-38389">SPARK-38389</a>)</li>
<li>Add the <span style="text-decoration:underline;">DATEADD</span>() alias for <code class="language-plaintext highlighter-rouge">TIMESTAMPADD()</code> (<a href="https://issues.apache.org/jira/browse/SPARK-38332">SPARK-38332</a>)</li>
<li>Add the <span style="text-decoration:underline;">convert_timezone</span>() function (<a href="https://issues.apache.org/jira/browse/SPARK-37552">SPARK-37552</a>, <a href="https://issues.apache.org/jira/browse/SPARK-37568">SPARK-37568</a>)</li>
<li>Expose <span style="text-decoration:underline;">make_date</span> expression in functions.scala (<a href="https://issues.apache.org/jira/browse/SPARK-36554">SPARK-36554</a>)</li>
</ul>
</li>
<li>AES functions (<a href="https://issues.apache.org/jira/browse/SPARK-12567">SPARK-12567</a>)
<ul>
<li>Add <span style="text-decoration:underline;">aes_encrypt</span> and <span style="text-decoration:underline;">aes_decrypt</span> builtin functions (<a href="https://issues.apache.org/jira/browse/SPARK-12567">SPARK-12567</a>)</li>
<li>Support the GCM mode by <span style="text-decoration:underline;">aes_encrypt</span>()/<span style="text-decoration:underline;">aes_decrypt</span>() (<a href="https://issues.apache.org/jira/browse/SPARK-37591">SPARK-37591</a>)</li>
<li>Set <code class="language-plaintext highlighter-rouge">GCM</code> as the default mode in <span style="text-decoration:underline;">aes_encrypt</span>()/<span style="text-decoration:underline;">aes_decrypt</span>() (<a href="https://issues.apache.org/jira/browse/SPARK-37666">SPARK-37666</a>)</li>
<li>Add the <code class="language-plaintext highlighter-rouge">mode</code> and <code class="language-plaintext highlighter-rouge">padding</code> args to <span style="text-decoration:underline;">aes_encrypt</span>()/<span style="text-decoration:underline;">aes_decrypt</span>() (<a href="https://issues.apache.org/jira/browse/SPARK-37586">SPARK-37586</a>)</li>
</ul>
</li>
<li>ANSI Aggregation Function (<a href="https://issues.apache.org/jira/browse/SPARK-37671">SPARK-37671</a>)
<ul>
<li>Support ANSI Aggregate Function: <span style="text-decoration:underline;">regr_count</span> (<a href="https://issues.apache.org/jira/browse/SPARK-37613">SPARK-37613</a>)</li>
<li>Support ANSI Aggregate Function: <span style="text-decoration:underline;">regr_avgx</span> &amp; <span style="text-decoration:underline;">regr_avgy</span> (<a href="https://issues.apache.org/jira/browse/SPARK-37614">SPARK-37614</a>)</li>
<li>Support ANSI Aggregate Function: <span style="text-decoration:underline;">regr_count</span> (<a href="https://issues.apache.org/jira/browse/SPARK-37613">SPARK-37613</a>)</li>
<li>Support ANSI Aggregate Function: regr_r2 (<a href="https://issues.apache.org/jira/browse/SPARK-37641">SPARK-37641</a>)</li>
<li>Support ANSI Aggregate Function: <span style="text-decoration:underline;">array_agg</span> (<a href="https://issues.apache.org/jira/browse/SPARK-27974">SPARK-27974</a>)</li>
<li>Support ANSI Aggregation Function: <span style="text-decoration:underline;">percentile_cont</span> (<a href="https://issues.apache.org/jira/browse/SPARK-37676">SPARK-37676</a>, <a href="https://issues.apache.org/jira/browse/SPARK-38219">SPARK-38219</a>)</li>
<li>Support ANSI Aggregation Function: <span style="text-decoration:underline;">percentile_disc</span> (<a href="https://issues.apache.org/jira/browse/SPARK-37691">SPARK-37691</a>)</li>
<li>New SQL function: try_avg (<a href="https://issues.apache.org/jira/browse/SPARK-38589">SPARK-38589</a>)</li>
</ul>
</li>
<li>Collections
<ul>
<li>Introduce SQL function <span style="text-decoration:underline;">ARRAY_SIZE</span> (<a href="https://issues.apache.org/jira/browse/SPARK-38345">SPARK-38345</a>)</li>
<li>New SQL function: <span style="text-decoration:underline;">map_contains_key</span> (<a href="https://issues.apache.org/jira/browse/SPARK-37584">SPARK-37584</a>)</li>
<li>New SQL function: <span style="text-decoration:underline;">try_element_at</span> (<a href="https://issues.apache.org/jira/browse/SPARK-37533">SPARK-37533</a>)</li>
<li>New SQL function: <span style="text-decoration:underline;">try_sum</span> (<a href="https://issues.apache.org/jira/browse/SPARK-38548">SPARK-38548</a>)</li>
</ul>
</li>
<li>Format
<ul>
<li>Add a new SQL function <span style="text-decoration:underline;">to_binary</span> (<a href="https://issues.apache.org/jira/browse/SPARK-37507">SPARK-37507</a>, <a href="https://issues.apache.org/jira/browse/SPARK-38796">SPARK-38796</a>)</li>
<li>New SQL function: <span style="text-decoration:underline;">try_to_binary</span> (<a href="https://issues.apache.org/jira/browse/SPARK-38590">SPARK-38590</a>, <a href="https://issues.apache.org/jira/browse/SPARK-38796">SPARK-38796</a>)</li>
<li>Data Type Formatting Functions: <span style="text-decoration:underline;">to_number</span> (<a href="https://issues.apache.org/jira/browse/SPARK-28137">SPARK-28137</a>)</li>
</ul>
</li>
<li>String/Binary
<ul>
<li>Add <span style="text-decoration:underline;">CONTAINS</span>() string function (<a href="https://issues.apache.org/jira/browse/SPARK-37508">SPARK-37508</a>)</li>
<li>Add the <span style="text-decoration:underline;">startswith</span>() and <span style="text-decoration:underline;">endswith</span>() string functions (<a href="https://issues.apache.org/jira/browse/SPARK-37520">SPARK-37520</a>)</li>
<li>Add lpad and rpad functions for binary strings (<a href="https://issues.apache.org/jira/browse/SPARK-37047">SPARK-37047</a>)</li>
<li>Support split_part Function (<a href="https://issues.apache.org/jira/browse/SPARK-38063">SPARK-38063</a>)</li>
</ul>
</li>
<li>Add scale parameter to <span style="text-decoration:underline;">floor</span> and <span style="text-decoration:underline;">ceil</span> functions (<a href="https://issues.apache.org/jira/browse/SPARK-37475">SPARK-37475</a>)</li>
<li>New SQL functions: <span style="text-decoration:underline;">try_subtract</span> and <span style="text-decoration:underline;">try_multiply</span> (<a href="https://issues.apache.org/jira/browse/SPARK-38164">SPARK-38164</a>)</li>
<li>Implements <span style="text-decoration:underline;">histogram_numeric</span> aggregation function which supports partial aggregation (<a href="https://issues.apache.org/jira/browse/SPARK-16280">SPARK-16280</a>)</li>
<li>Add max_by/min_by to sql.functions (<a href="https://issues.apache.org/jira/browse/SPARK-36963">SPARK-36963</a>)</li>
<li>Add new built-in SQL functions: SEC and CSC (<a href="https://issues.apache.org/jira/browse/SPARK-36683">SPARK-36683</a>)</li>
<li>array_intersect handles duplicated Double.NaN and Float.NaN (<a href="https://issues.apache.org/jira/browse/SPARK-36754">SPARK-36754</a>)</li>
<li>Add cot as Scala and Python functions (<a href="https://issues.apache.org/jira/browse/SPARK-36660">SPARK-36660</a>)</li>
</ul>
</li>
</ul>
<h4 id="performance-enhancements">Performance enhancements</h4>
<ul>
<li>Whole-stage code generation
<ul>
<li>Add code-gen for sort aggregate without grouping keys (<a href="https://issues.apache.org/jira/browse/SPARK-37564">SPARK-37564</a>)</li>
<li>Add code-gen for full outer sort merge join (<a href="https://issues.apache.org/jira/browse/SPARK-35352">SPARK-35352</a>)</li>
<li>Add code-gen for full outer shuffled hash join (<a href="https://issues.apache.org/jira/browse/SPARK-32567">SPARK-32567</a>)</li>
<li>Add code-gen for existence sort merge join (<a href="https://issues.apache.org/jira/browse/SPARK-37316">SPARK-37316</a>)</li>
</ul>
</li>
<li>Push down (filters)
<ul>
<li>Push down filters through RebalancePartitions (<a href="https://issues.apache.org/jira/browse/SPARK-37828">SPARK-37828</a>)</li>
<li>Push down boolean column filter (<a href="https://issues.apache.org/jira/browse/SPARK-36644">SPARK-36644</a>)</li>
<li>Push down limit 1 for right side of left semi/anti join if join condition is empty (<a href="https://issues.apache.org/jira/browse/SPARK-37917">SPARK-37917</a>)</li>
<li>Support propagate empty relation through aggregate/union (<a href="https://issues.apache.org/jira/browse/SPARK-35442">SPARK-35442</a>)</li>
<li>Row-level Runtime Filtering (<a href="https://issues.apache.org/jira/browse/SPARK-32268">SPARK-32268</a>)</li>
<li>Support Left Semi join in row level runtime filters (<a href="https://issues.apache.org/jira/browse/SPARK-38565">SPARK-38565</a>)</li>
<li>Support predicate pushdown and column pruning for de-duped CTEs (<a href="https://issues.apache.org/jira/browse/SPARK-37670">SPARK-37670</a>)</li>
</ul>
</li>
<li>Vectorization
<ul>
<li>Implement a ConstantColumnVector and improve performance of the hidden file metadata (<a href="https://issues.apache.org/jira/browse/SPARK-37896">SPARK-37896</a>)</li>
<li>Enable vectorized read for VectorizedPlainValuesReader.readBooleans (<a href="https://issues.apache.org/jira/browse/SPARK-35867">SPARK-35867</a>)</li>
</ul>
</li>
<li>Combine/remove/replace nodes
<ul>
<li>Combine unions if there is a project between them (<a href="https://issues.apache.org/jira/browse/SPARK-37915">SPARK-37915</a>)</li>
<li>Combine to one cast if we can safely up-cast two casts (<a href="https://issues.apache.org/jira/browse/SPARK-37922">SPARK-37922</a>)</li>
<li>Remove the Sort if it is the child of RepartitionByExpression (<a href="https://issues.apache.org/jira/browse/SPARK-36703">SPARK-36703</a>)</li>
<li>Removes outer join if it only has DISTINCT on streamed side with alias (<a href="https://issues.apache.org/jira/browse/SPARK-37292">SPARK-37292</a>)</li>
<li>Replace hash with sort aggregate if child is already sorted (<a href="https://issues.apache.org/jira/browse/SPARK-37455">SPARK-37455</a>)</li>
<li>Replace object hash with sort aggregate if child is already sorted (<a href="https://issues.apache.org/jira/browse/SPARK-37557">SPARK-37557</a>)</li>
<li>Only collapse projects if we don&#8217;t duplicate expensive expressions (<a href="https://issues.apache.org/jira/browse/SPARK-36718">SPARK-36718</a>)</li>
<li>Remove redundant aliases after RewritePredicateSubquery (<a href="https://issues.apache.org/jira/browse/SPARK-36280">SPARK-36280</a>)</li>
<li>Merge non-correlated scalar subqueries (<a href="https://issues.apache.org/jira/browse/SPARK-34079">SPARK-34079</a>)</li>
</ul>
</li>
<li>Partitioning
<ul>
<li>Do not add dynamic partition pruning if there exists static partition pruning (<a href="https://issues.apache.org/jira/browse/SPARK-38148">SPARK-38148</a>)</li>
<li>Improve RebalancePartitions in rules of Optimizer (<a href="https://issues.apache.org/jira/browse/SPARK-37904">SPARK-37904</a>)</li>
<li>Add small partition factor for rebalance partitions (<a href="https://issues.apache.org/jira/browse/SPARK-37357">SPARK-37357</a>)</li>
</ul>
</li>
<li>Join
<ul>
<li>Fine tune logic to demote Broadcast hash join in DynamicJoinSelection (<a href="https://issues.apache.org/jira/browse/SPARK-37753">SPARK-37753</a>)</li>
<li>Ignore duplicated join keys when building relation for SEMI/ANTI shuffled hash join (<a href="https://issues.apache.org/jira/browse/SPARK-36794">SPARK-36794</a>)</li>
<li>Support optimize skewed join even if introduce extra shuffle (<a href="https://issues.apache.org/jira/browse/SPARK-33832">SPARK-33832</a>)</li>
</ul>
</li>
<li>AQE
<ul>
<li>Support eliminate limits in AQE Optimizer (<a href="https://issues.apache.org/jira/browse/SPARK-36424">SPARK-36424</a>)</li>
<li>Optimize one row plan in normal and AQE Optimizer (<a href="https://issues.apache.org/jira/browse/SPARK-38162">SPARK-38162</a>)</li>
</ul>
</li>
<li>Aggregate.groupOnly support foldable expressions (<a href="https://issues.apache.org/jira/browse/SPARK-38489">SPARK-38489</a>)</li>
<li>ByteArrayMethods arrayEquals should fast skip the check of aligning with unaligned platform (<a href="https://issues.apache.org/jira/browse/SPARK-37796">SPARK-37796</a>)</li>
<li>Add tree pattern pruning to CTESubstitution rule (<a href="https://issues.apache.org/jira/browse/SPARK-37379">SPARK-37379</a>)</li>
<li>Add more Not operator simplifications (<a href="https://issues.apache.org/jira/browse/SPARK-36665">SPARK-36665</a>)</li>
<li>Support BooleanType in UnwrapCastInBinaryComparison (<a href="https://issues.apache.org/jira/browse/SPARK-36607">SPARK-36607</a>)</li>
<li>Coalesce drop all expressions after the first non nullable expression (<a href="https://issues.apache.org/jira/browse/SPARK-36359">SPARK-36359</a>)</li>
<li>Add a logical plan visitor to propagate the distinct attributes (<a href="https://issues.apache.org/jira/browse/SPARK-36194">SPARK-36194</a>)</li>
</ul>
<h4 id="built-in-connector-enhancements">Built-in Connector Enhancements</h4>
<ul>
<li>General
<ul>
<li>Lenient serialization of datetime from datasource (<a href="https://issues.apache.org/jira/browse/SPARK-38437">SPARK-38437</a>)</li>
<li>Treat table location as absolute when the first letter of its path is slash in create/alter table (<a href="https://issues.apache.org/jira/browse/SPARK-38236">SPARK-38236</a>)</li>
<li>Remove leading zeros from empty static number type partition (<a href="https://issues.apache.org/jira/browse/SPARK-35561">SPARK-35561</a>)</li>
<li>Support <code class="language-plaintext highlighter-rouge">ignoreCorruptFiles</code> and <code class="language-plaintext highlighter-rouge">ignoreMissingFiles</code> in Data Source options (<a href="https://issues.apache.org/jira/browse/SPARK-38767">SPARK-38767</a>)</li>
</ul>
</li>
<li>Parquet
<ul>
<li>Enable matching schema column names by field ids (<a href="https://issues.apache.org/jira/browse/SPARK-38094">SPARK-38094</a>)</li>
<li>Remove check field name when reading/writing data in parquet (<a href="https://issues.apache.org/jira/browse/SPARK-27442">SPARK-27442</a>)</li>
<li>Support vectorized read boolean values use RLE encoding with Parquet DataPage V2 (<a href="https://issues.apache.org/jira/browse/SPARK-37864">SPARK-37864</a>)</li>
<li>Support Parquet V2 data page encoding (DELTA_BINARY_PACKED) for the vectorized path (<a href="https://issues.apache.org/jira/browse/SPARK-36879">SPARK-36879</a>)</li>
<li>Rebase timestamps in the session time zone saved in Parquet/Avro metadata (<a href="https://issues.apache.org/jira/browse/SPARK-37705">SPARK-37705</a>)</li>
<li>Push down group by partition column for aggregate (<a href="https://issues.apache.org/jira/browse/SPARK-36646">SPARK-36646</a>)</li>
<li>Aggregate (Min/Max/Count) push down for Parquet (<a href="https://issues.apache.org/jira/browse/SPARK-36645">SPARK-36645</a>)</li>
<li>Reduce default page size by LONG_ARRAY_OFFSET if G1GC and ON_HEAP are used (<a href="https://issues.apache.org/jira/browse/SPARK-37593">SPARK-37593</a>)</li>
<li>Implement vectorized DELTA_BYTE_ARRAY and DELTA_LENGTH_BYTE_ARRAY encodings for Parquet V2 support (<a href="https://issues.apache.org/jira/browse/SPARK-37974">SPARK-37974</a>)</li>
<li>Support complex types for Parquet vectorized reader (<a href="https://issues.apache.org/jira/browse/SPARK-34863">SPARK-34863</a>)</li>
</ul>
</li>
<li>ORC
<ul>
<li>Remove check field name when reading/writing existing data in ORC (<a href="https://issues.apache.org/jira/browse/SPARK-37965">SPARK-37965</a>)</li>
<li>Aggregate push down for ORC (<a href="https://issues.apache.org/jira/browse/SPARK-34960">SPARK-34960</a>)</li>
<li>Support reading and writing ANSI intervals from/to ORC data sources (<a href="https://issues.apache.org/jira/browse/SPARK-36931">SPARK-36931</a>)</li>
<li>Support number-only column names in ORC data sources (<a href="https://issues.apache.org/jira/browse/SPARK-36663">SPARK-36663</a>)</li>
</ul>
</li>
<li>JSON
<ul>
<li>Respect allowNonNumericNumbers when parsing quoted NaN and Infinity values in JSON reader (<a href="https://issues.apache.org/jira/browse/SPARK-38060">SPARK-38060</a>)</li>
<li>Use CAST for datetime in CSV/JSON by default (<a href="https://issues.apache.org/jira/browse/SPARK-36536">SPARK-36536</a>)</li>
<li>Align error message for unsupported key types in MapType in Json reader (<a href="https://issues.apache.org/jira/browse/SPARK-35320">SPARK-35320</a>)</li>
<li>Add a legacy configuration for respecting nullability in DataFrame.schema.csv/json(ds) (<a href="https://issues.apache.org/jira/browse/SPARK-35912">SPARK-35912</a>)</li>
</ul>
</li>
<li>CSV
<ul>
<li>Fix referring to the corrupt record column from CSV (<a href="https://issues.apache.org/jira/browse/SPARK-38523">SPARK-38523</a>)</li>
<li>null values should be saved as nothing instead of quoted empty Strings &#8220;&#8221; by default (<a href="https://issues.apache.org/jira/browse/SPARK-37575">SPARK-37575</a>)</li>
<li>Speed up Timestamp type inference of default format in JSON/CSV data source (<a href="https://issues.apache.org/jira/browse/SPARK-39193">SPARK-39193</a>)</li>
</ul>
</li>
<li>JDBC
<ul>
<li>Support cascade mode for JDBC V2 (<a href="https://issues.apache.org/jira/browse/SPARK-37929">SPARK-37929</a>)</li>
<li>Add the IMMEDIATE statement to the DB2 dialect truncate implementation (<a href="https://issues.apache.org/jira/browse/SPARK-30062">SPARK-30062</a>)</li>
<li>Support aggregate functions of built-in JDBC dialect (<a href="https://issues.apache.org/jira/browse/SPARK-37867">SPARK-37867</a>)</li>
<li>Move compileAggregates from JDBCRDD to JdbcDialect (<a href="https://issues.apache.org/jira/browse/SPARK-37286">SPARK-37286</a>)</li>
<li>Implement dropIndex and listIndexes in JDBC (MySQL dialect) (<a href="https://issues.apache.org/jira/browse/SPARK-36914">SPARK-36914</a>)</li>
<li>Supports list namespaces in JDBC V2 MySQL dialect (<a href="https://issues.apache.org/jira/browse/SPARK-38054">SPARK-38054</a>)</li>
<li>Add factory method getConnection into JDBCDialect (<a href="https://issues.apache.org/jira/browse/SPARK-38361">SPARK-38361</a>)</li>
<li>JDBC dialect should decide which functions can be pushed down (<a href="https://issues.apache.org/jira/browse/SPARK-39162">SPARK-39162</a>)</li>
<li>Propagate correct JDBC properties in JDBC connector provider and add &#8220;connectionProvider&#8221; option (<a href="https://issues.apache.org/jira/browse/SPARK-36163">SPARK-36163</a>)</li>
<li>Refactor framework so that JDBC dialect can compile filters in its own way (<a href="https://issues.apache.org/jira/browse/SPARK-38432">SPARK-38432</a>)</li>
<li>Refactor framework so that JDBC dialect can compile expressions by itself (<a href="https://issues.apache.org/jira/browse/SPARK-38196">SPARK-38196</a>)</li>
<li>Implement createIndex and IndexExists in DS V2 JDBC (MySQL dialect) (<a href="https://issues.apache.org/jira/browse/SPARK-36913">SPARK-36913</a>)</li>
</ul>
</li>
<li>Hive
<ul>
<li>Support writing Hive bucketed table (Parquet/ORC format with Hive hash) (<a href="https://issues.apache.org/jira/browse/SPARK-32709">SPARK-32709</a>)</li>
<li>Support writing Hive bucketed table (Hive file formats with Hive hash) (<a href="https://issues.apache.org/jira/browse/SPARK-32712">SPARK-32712</a>)</li>
<li>Use expressions to filter Hive partitions at client side (<a href="https://issues.apache.org/jira/browse/SPARK-35437">SPARK-35437</a>)</li>
<li>Support Dynamic Partition pruning for HiveTableScanExec (<a href="https://issues.apache.org/jira/browse/SPARK-36876">SPARK-36876</a>)</li>
<li>InsertIntoHiveDir should use data source if it&#8217;s convertible (<a href="https://issues.apache.org/jira/browse/SPARK-38215">SPARK-38215</a>)</li>
</ul>
</li>
</ul>
<h4 id="data-source-v2-api">Data Source V2 API</h4>
<ul>
<li>New interfaces
<ul>
<li>Introduce a new DataSource V2 interface HasPartitionKey (<a href="https://issues.apache.org/jira/browse/SPARK-37376">SPARK-37376</a>)</li>
<li>Add interface SupportsPushDownV2Filters (<a href="https://issues.apache.org/jira/browse/SPARK-36760">SPARK-36760</a>)</li>
<li>Support DataSource V2 CreateTempViewUsing (<a href="https://issues.apache.org/jira/browse/SPARK-35803">SPARK-35803</a>)</li>
<li>Add a class to represent general aggregate functions in DS V2 (<a href="https://issues.apache.org/jira/browse/SPARK-37789">SPARK-37789</a>)</li>
<li>A new framework to represent catalyst expressions in DS V2 APIs (<a href="https://issues.apache.org/jira/browse/SPARK-37960">SPARK-37960</a>)</li>
<li>Add APIs for group-based row-level operations (<a href="https://issues.apache.org/jira/browse/SPARK-38625">SPARK-38625</a>)</li>
</ul>
</li>
<li>Migrate commands
<ul>
<li>Migrate SHOW CREATE TABLE to use V2 command by default (<a href="https://issues.apache.org/jira/browse/SPARK-37878">SPARK-37878</a>)</li>
<li>Migrate CREATE NAMESPACE to use V2 command by default (<a href="https://issues.apache.org/jira/browse/SPARK-37636">SPARK-37636</a>)</li>
<li>Migrate DESCRIBE NAMESPACE to use V2 command by default (<a href="https://issues.apache.org/jira/browse/SPARK-37150">SPARK-37150</a>)</li>
</ul>
</li>
<li>Indexing
<ul>
<li>Support drop index for Data Source V2 (<a href="https://issues.apache.org/jira/browse/SPARK-37200">SPARK-37200</a>)</li>
<li>Add Create Index syntax support (<a href="https://issues.apache.org/jira/browse/SPARK-36895">SPARK-36895</a>)</li>
<li>DS V2 Index Support: Add supportsIndex interface (<a href="https://issues.apache.org/jira/browse/SPARK-36526">SPARK-36526</a>)</li>
</ul>
</li>
<li>Push down (<a href="https://issues.apache.org/jira/browse/SPARK-38788">SPARK-38788</a>)
<ul>
<li>Add DS V2 filters (<a href="https://issues.apache.org/jira/browse/SPARK-36556">SPARK-36556</a>)</li>
<li>Push down boolean column filter for Data Source V2 (<a href="https://issues.apache.org/jira/browse/SPARK-36644">SPARK-36644</a>)</li>
<li>Support push down top N to JDBC data source V2 (<a href="https://issues.apache.org/jira/browse/SPARK-37483">SPARK-37483</a>)</li>
<li>DS V2 Sample Push Down (<a href="https://issues.apache.org/jira/browse/SPARK-37038">SPARK-37038</a>)</li>
<li>DS V2 LIMIT push down (<a href="https://issues.apache.org/jira/browse/SPARK-37020">SPARK-37020</a>)</li>
<li>DS V2 supports partial aggregate push-down <code class="language-plaintext highlighter-rouge">AVG</code> (<a href="https://issues.apache.org/jira/browse/SPARK-37839">SPARK-37839</a>)</li>
<li>Support datasource V2 complete aggregate pushdown (<a href="https://issues.apache.org/jira/browse/SPARK-37644">SPARK-37644</a>)</li>
<li>If <code class="language-plaintext highlighter-rouge">Sum</code>, <code class="language-plaintext highlighter-rouge">Count</code>, <code class="language-plaintext highlighter-rouge">Any</code> accompany distinct, cannot do partial agg push down (<a href="https://issues.apache.org/jira/browse/SPARK-38560">SPARK-38560</a>)</li>
<li>Translate more standard aggregate functions for pushdown (<a href="https://issues.apache.org/jira/browse/SPARK-37527">SPARK-37527</a>)</li>
<li>DS V2 aggregate push-down supports project with alias (<a href="https://issues.apache.org/jira/browse/SPARK-38533">SPARK-38533</a>)</li>
<li>DS V2 topN push-down supports project with alias (<a href="https://issues.apache.org/jira/browse/SPARK-38644">SPARK-38644</a>)</li>
<li>DS V2 Top N push-down supports order by expressions (<a href="https://issues.apache.org/jira/browse/SPARK-39037">SPARK-39037</a>)</li>
<li>Datasource V2 supports partial topN push-down (<a href="https://issues.apache.org/jira/browse/SPARK-38391">SPARK-38391</a>)</li>
<li>Support push down Cast to JDBC data source V2 (<a href="https://issues.apache.org/jira/browse/SPARK-38633">SPARK-38633</a>)</li>
<li>Remove <code class="language-plaintext highlighter-rouge">Limit</code> from plan if complete push down limit to data source (<a href="https://issues.apache.org/jira/browse/SPARK-38768">SPARK-38768</a>)</li>
<li>DS V2 supports push down misc non-aggregate functions (<a href="https://issues.apache.org/jira/browse/SPARK-38761">SPARK-38761</a>)</li>
<li>DS V2 supports push down math functions (<a href="https://issues.apache.org/jira/browse/SPARK-38855">SPARK-38855</a>)</li>
<li>DS V2 aggregate push-down supports group by expressions (<a href="https://issues.apache.org/jira/browse/SPARK-38997">SPARK-38997</a>)</li>
<li>DS V2 aggregate partial push-down should support group by without aggregate functions (<a href="https://issues.apache.org/jira/browse/SPARK-39135">SPARK-39135</a>)</li>
</ul>
</li>
<li>Support nested columns in ORC vectorized reader for data source V2 (<a href="https://issues.apache.org/jira/browse/SPARK-36404">SPARK-36404</a>)</li>
<li>Update task metrics from ds V2 custom metrics (<a href="https://issues.apache.org/jira/browse/SPARK-37578">SPARK-37578</a>)</li>
<li>Unify V1 and V2 options output of <code class="language-plaintext highlighter-rouge">SHOW CREATE TABLE</code> command (<a href="https://issues.apache.org/jira/browse/SPARK-37494">SPARK-37494</a>)</li>
<li>Add command <code class="language-plaintext highlighter-rouge">SHOW CATALOGS</code> (<a href="https://issues.apache.org/jira/browse/SPARK-35973">SPARK-35973</a>)</li>
</ul>
<h4 id="kubernetes-enhancements">Kubernetes Enhancements</h4>
<ul>
<li>Executor Rolling in Kubernetes environment (<a href="https://issues.apache.org/jira/browse/SPARK-37810">SPARK-37810</a>)</li>
<li>Support Customized Kubernetes Schedulers (<a href="https://issues.apache.org/jira/browse/SPARK-36057">SPARK-36057</a>)</li>
<li>executorIdleTimeout is not working for pending pods on K8s (<a href="https://issues.apache.org/jira/browse/SPARK-37049">SPARK-37049</a>)</li>
<li>Upgrade kubernetes-client to 5.12.2 (<a href="https://issues.apache.org/jira/browse/SPARK-38817">SPARK-38817</a>)</li>
<li>Make memory overhead factor configurable (<a href="https://issues.apache.org/jira/browse/SPARK-38194">SPARK-38194</a>)</li>
<li>Add Volcano built-in integration and PodGroup template support for Spark on Kubernetes (experimental). (<a href="https://issues.apache.org/jira/browse/SPARK-36061">SPARK-36061</a>, <a href="https://issues.apache.org/jira/browse/SPARK-38455">SPARK-38455</a>)</li>
<li>Add KubernetesCustom[Driver/Executor]FeatureConfigStep developer API (<a href="https://issues.apache.org/jira/browse/SPARK-37145">SPARK-37145</a>)</li>
</ul>
<h4 id="node-decommission">Node Decommission</h4>
<ul>
<li>FallbackStorage shouldn’t attempt to resolve arbitrary “remote” hostname (<a href="https://issues.apache.org/jira/browse/SPARK-38062">SPARK-38062</a>)</li>
<li>ExecutorMonitor.onExecutorRemoved should handle ExecutorDecommission as finished (<a href="https://issues.apache.org/jira/browse/SPARK-38023">SPARK-38023</a>)</li>
</ul>
<h4 id="push-based-shuffle">Push-based shuffle</h4>
<ul>
<li>Adaptive shuffle merge finalization for push-based shuffle (<a href="https://issues.apache.org/jira/browse/SPARK-33701">SPARK-33701</a>)</li>
<li>Adaptive fetch of shuffle mergers for push-based shuffle (<a href="https://issues.apache.org/jira/browse/SPARK-34826">SPARK-34826</a>)</li>
<li>Skip diagnosis on merged blocks from push-based shuffle (<a href="https://issues.apache.org/jira/browse/SPARK-37695">SPARK-37695</a>)</li>
<li>PushBlockStreamCallback should check isTooLate first to avoid NPE (<a href="https://issues.apache.org/jira/browse/SPARK-37847">SPARK-37847</a>)</li>
<li>Push-based merge finalization bugs in the RemoteBlockPushResolver (<a href="https://issues.apache.org/jira/browse/SPARK-37675">SPARK-37675</a>)</li>
<li>Avoid fetching merge status when shuffleMergeEnabled is false for a shuffleDependency during retry (<a href="https://issues.apache.org/jira/browse/SPARK-37023">SPARK-37023</a>)</li>
</ul>
<h4 id="other-notable-changes">Other Notable Changes</h4>
<ul>
<li>Add fine grained locking to BlockInfoManager (<a href="https://issues.apache.org/jira/browse/SPARK-37356">SPARK-37356</a>)</li>
<li>Support mapping Spark gpu/fpga resource types to custom YARN resource type (<a href="https://issues.apache.org/jira/browse/SPARK-37208">SPARK-37208</a>)</li>
<li>Report accurate shuffle block size if it&#8217;s skewed (<a href="https://issues.apache.org/jira/browse/SPARK-36967">SPARK-36967</a>)</li>
<li>Supporting Netty Logging at the network layer (<a href="https://issues.apache.org/jira/browse/SPARK-36719">SPARK-36719</a>)</li>
</ul>
<h3 id="structured-streaming">Structured Streaming</h3>
<h4 id="major-feature">Major feature</h4>
<ul>
<li>Introduce Trigger.AvailableNow for running streaming queries like Trigger.Once in multiple batches (<a href="https://issues.apache.org/jira/browse/SPARK-36533">SPARK-36533</a>)</li>
</ul>
<h4 id="other-notable-changes-1">Other Notable Changes</h4>
<ul>
<li>Use StatefulOpClusteredDistribution for stateful operators with respecting backward compatibility (<a href="https://issues.apache.org/jira/browse/SPARK-38204">SPARK-38204</a>)</li>
<li>Fix flatMapGroupsWithState timeout in batch with data for key (<a href="https://issues.apache.org/jira/browse/SPARK-38320">SPARK-38320</a>)</li>
<li>Fix correctness issue on stream-stream outer join with RocksDB state store provider (<a href="https://issues.apache.org/jira/browse/SPARK-38684">SPARK-38684</a>)</li>
<li>Upgrade Kafka to 3.1.0 (<a href="https://issues.apache.org/jira/browse/SPARK-36837">SPARK-36837</a>)</li>
<li>Support Trigger.AvailableNow on Kafka data source (<a href="https://issues.apache.org/jira/browse/SPARK-36649">SPARK-36649</a>)</li>
<li>Optimize write path on RocksDB state store provider (<a href="https://issues.apache.org/jira/browse/SPARK-37224">SPARK-37224</a>)</li>
<li>Introduce a new data source for providing consistent set of rows per microbatch (<a href="https://issues.apache.org/jira/browse/SPARK-37062">SPARK-37062</a>)</li>
<li>Use HashClusteredDistribution for stateful operators with respecting backward compatibility (<a href="https://issues.apache.org/jira/browse/SPARK-38204">SPARK-38204</a>)</li>
<li>Make foreachBatch streaming query stop gracefully (<a href="https://issues.apache.org/jira/browse/SPARK-39218">SPARK-39218</a>)</li>
</ul>
<h3 id="pyspark">PySpark</h3>
<h4 id="pandas-api-on-spark">Pandas API on Spark</h4>
<ul>
<li>Major improvement
<ul>
<li>&#8216;distributed-sequence&#8217; index optimization with being <strong>default</strong> (<a href="https://issues.apache.org/jira/browse/SPARK-37649">SPARK-37649</a>, <a href="https://issues.apache.org/jira/browse/SPARK-36559">SPARK-36559</a>, <a href="https://issues.apache.org/jira/browse/SPARK-36338">SPARK-36338</a>)</li>
<li>Support to specify index type and name in pandas API on Spark (<a href="https://issues.apache.org/jira/browse/SPARK-36709">SPARK-36709</a>)</li>
<li>Show default index type in SQL plans for pandas API on Spark (<a href="https://issues.apache.org/jira/browse/SPARK-38654">SPARK-38654</a>)</li>
</ul>
</li>
<li>Major feature
<ul>
<li>Implement SparkSQL native ps.<strong>merge_asof</strong> (<a href="https://issues.apache.org/jira/browse/SPARK-36813">SPARK-36813</a>)</li>
<li>Support TimedeltaIndex in pandas API on Spark (<a href="https://issues.apache.org/jira/browse/SPARK-37525">SPARK-37525</a>)</li>
<li>Support Python’s <strong>timedelta</strong> (<a href="https://issues.apache.org/jira/browse/SPARK-37275">SPARK-37275</a>, <a href="https://issues.apache.org/jira/browse/SPARK-37510">SPARK-37510</a>)</li>
<li>Implement functions in CategoricalAccessor/CategoricalIndex (<a href="https://issues.apache.org/jira/browse/SPARK-36185">SPARK-36185</a>)</li>
<li>Uses Python&#8217;s standard string formatter for SQL API in pandas API on Spark (<a href="https://issues.apache.org/jira/browse/SPARK-37436">SPARK-37436</a>)</li>
<li>Support basic operations of timedelta Series/Index (<a href="https://issues.apache.org/jira/browse/SPARK-37510">SPARK-37510</a>)</li>
<li>Support ps.MultiIndex.dtypes (<a href="https://issues.apache.org/jira/browse/SPARK-36930">SPARK-36930</a>)</li>
<li>Implement Index.map (<a href="https://issues.apache.org/jira/browse/SPARK-36469">SPARK-36469</a>)</li>
<li>Implement <code class="language-plaintext highlighter-rouge">Series.__xor__</code> and <code class="language-plaintext highlighter-rouge">Series.__rxor__</code> (<a href="https://issues.apache.org/jira/browse/SPARK-36653">SPARK-36653</a>)</li>
<li>Implement unary operator <code class="language-plaintext highlighter-rouge">invert</code> of integral ps.Series/Index (<a href="https://issues.apache.org/jira/browse/SPARK-36003">SPARK-36003</a>)</li>
<li>Implement DataFrame.cov (<a href="https://issues.apache.org/jira/browse/SPARK-36396">SPARK-36396</a>)</li>
<li>Support str and timestamp for (Series|DataFrame).describe() (<a href="https://issues.apache.org/jira/browse/SPARK-37657">SPARK-37657</a>)</li>
<li>Support lambda <code class="language-plaintext highlighter-rouge">column</code> parameter of <code class="language-plaintext highlighter-rouge">DataFrame.rename</code> (<a href="https://issues.apache.org/jira/browse/SPARK-38763">SPARK-38763</a>)</li>
</ul>
</li>
</ul>
<h4 id="other-notable-changes-2">Other Notable Changes</h4>
<ul>
<li>Breaking changes
<ul>
<li>Drop references to Python 3.6 support in docs and python/docs (<a href="https://issues.apache.org/jira/browse/SPARK-36977">SPARK-36977</a>)</li>
<li>Remove namedtuple hack by replacing built-in pickle to cloudpickle (<a href="https://issues.apache.org/jira/browse/SPARK-32079">SPARK-32079</a>)</li>
<li>Bump minimum pandas version to 1.0.5 (<a href="https://issues.apache.org/jira/browse/SPARK-37465">SPARK-37465</a>)</li>
</ul>
</li>
<li>Major improvements
<ul>
<li>Provide a <strong>profiler</strong> for Python/Pandas UDFs (<a href="https://issues.apache.org/jira/browse/SPARK-37443">SPARK-37443</a>)</li>
<li>Uses Python&#8217;s standard string formatter for SQL API in PySpark (<a href="https://issues.apache.org/jira/browse/SPARK-37516">SPARK-37516</a>)</li>
<li>Expose SQL state and error class in PySpark exceptions (<a href="https://issues.apache.org/jira/browse/SPARK-36953">SPARK-36953</a>)</li>
<li>Try to capture faulthandler when a Python worker crashes (<a href="https://issues.apache.org/jira/browse/SPARK-36062">SPARK-36062</a>)</li>
</ul>
</li>
<li>Major feature
<ul>
<li>Implement DataFrame.<strong>mapInArrow</strong> in Python (<a href="https://issues.apache.org/jira/browse/SPARK-37228">SPARK-37228</a>)</li>
<li>Uses Python&#8217;s standard string formatter for SQL API in PySpark (<a href="https://issues.apache.org/jira/browse/SPARK-37516">SPARK-37516</a>)</li>
<li>Add df.withMetadata pyspark API (<a href="https://issues.apache.org/jira/browse/SPARK-36642">SPARK-36642</a>)</li>
<li>Support Python’s timedelta (<a href="https://issues.apache.org/jira/browse/SPARK-37275">SPARK-37275</a>)</li>
<li>Expose tableExists in pyspark.sql.catalog (<a href="https://issues.apache.org/jira/browse/SPARK-36176">SPARK-36176</a>)</li>
<li>Expose databaseExists in pyspark.sql.catalog (<a href="https://issues.apache.org/jira/browse/SPARK-36207">SPARK-36207</a>)</li>
<li>Exposing functionExists in pyspark sql catalog (<a href="https://issues.apache.org/jira/browse/SPARK-36258">SPARK-36258</a>)</li>
<li>Add Dataframe.observation to PySpark (<a href="https://issues.apache.org/jira/browse/SPARK-36263">SPARK-36263</a>)</li>
<li>Add max_by/min_by API to PySpark (<a href="https://issues.apache.org/jira/browse/SPARK-36972">SPARK-36972</a>)</li>
<li>Support to infer nested dict as a struct when creating a DataFrame (<a href="https://issues.apache.org/jira/browse/SPARK-35929">SPARK-35929</a>)</li>
<li>Add bit/octet_length APIs to Scala, Python and R (<a href="https://issues.apache.org/jira/browse/SPARK-36751">SPARK-36751</a>)</li>
<li>Support ILIKE API on Python (<a href="https://issues.apache.org/jira/browse/SPARK-36882">SPARK-36882</a>)</li>
<li>Add isEmpty method for the Python DataFrame API (<a href="https://issues.apache.org/jira/browse/SPARK-37207">SPARK-37207</a>)</li>
<li>Add multiple columns adding support (<a href="https://issues.apache.org/jira/browse/SPARK-35173">SPARK-35173</a>)</li>
<li>Add SparkContext.addArchive in PySpark (<a href="https://issues.apache.org/jira/browse/SPARK-38278">SPARK-38278</a>)</li>
<li>Make sql type reprs eval-able (<a href="https://issues.apache.org/jira/browse/SPARK-18621">SPARK-18621</a>)</li>
<li>Inline type hints for fpm.py in python/pyspark/mllib (<a href="https://issues.apache.org/jira/browse/SPARK-37396">SPARK-37396</a>)</li>
<li>Implement <code class="language-plaintext highlighter-rouge">dropna</code> parameter of <code class="language-plaintext highlighter-rouge">SeriesGroupBy.value_counts</code> (<a href="https://issues.apache.org/jira/browse/SPARK-38837">SPARK-38837</a>)</li>
</ul>
</li>
</ul>
<h3 id="mllib">MLLIB</h3>
<ul>
<li>Major feature
<ul>
<li>Add distanceMeasure param to trainKMeansModel (<a href="https://issues.apache.org/jira/browse/SPARK-37118">SPARK-37118</a>)</li>
<li>Expose LogisticRegression.setInitialModel, like KMeans et al do (<a href="https://issues.apache.org/jira/browse/SPARK-36481">SPARK-36481</a>)</li>
<li>Support CrossValidatorModel get standard deviation of metrics for each paramMap (<a href="https://issues.apache.org/jira/browse/SPARK-36425">SPARK-36425</a>)</li>
</ul>
</li>
<li>Major improvements
<ul>
<li>Optimize some treeAggregates in MLlib by delaying allocations (<a href="https://issues.apache.org/jira/browse/SPARK-35848">SPARK-35848</a>)</li>
<li>Rewrite _shared_params_code_gen.py to inline type hints for ml/param/shared.py (<a href="https://issues.apache.org/jira/browse/SPARK-37419">SPARK-37419</a>)</li>
</ul>
</li>
<li>Other Notable Changes
<ul>
<li>Update to breeze 1.2 (<a href="https://issues.apache.org/jira/browse/SPARK-35310">SPARK-35310</a>)</li>
</ul>
</li>
</ul>
<h3 id="sparkr">SparkR</h3>
<ul>
<li>Migrate SparkR docs to pkgdown (<a href="https://issues.apache.org/jira/browse/SPARK-37474">SPARK-37474</a>)</li>
<li>Expose make_date expression in R (<a href="https://issues.apache.org/jira/browse/SPARK-37108">SPARK-37108</a>)</li>
<li>Add max_by/min_by API to SparkR (<a href="https://issues.apache.org/jira/browse/SPARK-36976">SPARK-36976</a>)</li>
<li>Support ILIKE API on R (<a href="https://issues.apache.org/jira/browse/SPARK-36899">SPARK-36899</a>)</li>
<li>Add sec and csc as R functions (<a href="https://issues.apache.org/jira/browse/SPARK-36824">SPARK-36824</a>)</li>
<li>Add bit/octet_length APIs to Scala, Python and R (<a href="https://issues.apache.org/jira/browse/SPARK-36751">SPARK-36751</a>)</li>
<li>Add cot as an R function (<a href="https://issues.apache.org/jira/browse/SPARK-36688">SPARK-36688</a>)</li>
</ul>
<h3 id="ui">UI</h3>
<ul>
<li>Speculation metrics summary at stage level (<a href="https://issues.apache.org/jira/browse/SPARK-36038">SPARK-36038</a>)</li>
<li>Unified shuffle read block time to shuffle read fetch wait time in StagePage (<a href="https://issues.apache.org/jira/browse/SPARK-37469">SPARK-37469</a>)</li>
<li>Add modified configs for SQL execution in UI (<a href="https://issues.apache.org/jira/browse/SPARK-34735">SPARK-34735</a>)</li>
<li>Make ThriftServer recognize spark.sql.redaction.string.regex (<a href="https://issues.apache.org/jira/browse/SPARK-36400">SPARK-36400</a>)</li>
<li>Attach and start handler after application started in UI (<a href="https://issues.apache.org/jira/browse/SPARK-36237">SPARK-36237</a>)</li>
<li>Add commit duration to SQL tab&#8217;s graph node (<a href="https://issues.apache.org/jira/browse/SPARK-34399">SPARK-34399</a>)</li>
<li>Support RocksDB backend in Spark History Server (<a href="https://issues.apache.org/jira/browse/SPARK-37680">SPARK-37680</a>)</li>
<li>Show options for Pandas API on Spark in UI (<a href="https://issues.apache.org/jira/browse/SPARK-38656">SPARK-38656</a>)</li>
<li>Rename &#8216;SQL&#8217; to &#8216;SQL / DataFrame&#8217; in SQL UI page (<a href="https://issues.apache.org/jira/browse/SPARK-38657">SPARK-38657</a>)</li>
</ul>
<h3 id="build">Build</h3>
<ul>
<li>Build and Run Spark on Java 17 (<a href="https://issues.apache.org/jira/browse/SPARK-33772">SPARK-33772</a>)</li>
<li>Migrating from log4j 1 to log4j 2 (<a href="https://issues.apache.org/jira/browse/SPARK-37814">SPARK-37814</a>)</li>
<li>Upgrade log4j2 to 2.17.2 (<a href="https://issues.apache.org/jira/browse/SPARK-38544">SPARK-38544</a>)</li>
<li>Spark on Apple Silicon (<a href="https://issues.apache.org/jira/browse/SPARK-35781">SPARK-35781</a>)</li>
<li>Upgrade to Py4J 0.10.9.5 (<a href="https://issues.apache.org/jira/browse/SPARK-38563">SPARK-38563</a>)</li>
<li>Upgrade jackson due to CVE-2020-36518 (<a href="https://issues.apache.org/jira/browse/SPARK-38665">SPARK-38665</a>)</li>
<li>Upgrade Jackson to 2.13.3 (<a href="https://issues.apache.org/jira/browse/SPARK-39250">SPARK-39250</a>)</li>
<li>Update ORC to 1.7.4 (<a href="https://issues.apache.org/jira/browse/SPARK-38866">SPARK-38866</a>)</li>
<li>Update datatables to 1.10.25 (<a href="https://issues.apache.org/jira/browse/SPARK-38924">SPARK-38924</a>)</li>
<li>Upgrade Jetty to 9.4.46 (<a href="https://issues.apache.org/jira/browse/SPARK-38784">SPARK-38784</a>)</li>
<li>Upgrade h2 from 1.4.195 to 2.0.202 (<a href="https://issues.apache.org/jira/browse/SPARK-37734">SPARK-37734</a>)</li>
<li>Upgrade Apache Xerces Java to 2.12.2 (<a href="https://issues.apache.org/jira/browse/SPARK-39183">SPARK-39183</a>)</li>
</ul>
<h3 id="credits">Credits</h3>
<p>Last but not least, this release would not have been possible without the following contributors: Abhishek Somani, Adam Binford, Alex Balikov, Alex Ott, Alfonso Buono, Allison Wang, Almog Tavor, Amin Borjian, Andrew Liu, Andrew Olson, Andy Grove, Angerszhuuuu, Anish Shrigondekar, Ankur Dave, Anton Okolnychyi, Aravind Patnam, Attila Zsolt Piros, BOOTMGR, BelodengKlaus, Bessenyei Balázs Donát, Bjørn Jørgensen, Bo Zhang, Brian Fallik, Brian Yue, Bruce Robbins, Byron, Cary Lee, Cedric-Magnan, Chandni Singh, Chao Sun, Cheng Pan, Cheng Su, Chia-Ping Tsai, Chilaka Ramakrishna, Daniel Dai, Daniel Davies, Daniel Tenedorio, Daniel-Davies, Danny Guinther, Darek, David Christle, Denis Tarima, Dereck Li, Devesh Agrawal, Dhiren Navani, Diego Luis, Dmitriy Fishman, Dmytro Melnychenko, Dominik Gehl, Dongjoon Hyun, Emil Ejbyfeldt, Enrico Minack, Erik Krogen, Eugene Koifman, Fabian A.J. Thiele, Franck Thang, Fu Chen, Geek, Gengliang Wang, Gidon Gershinsky, H. Vetinari, Haejoon Lee, Harutaka Kawamura, Herman van Hovell, Holden Karau, Huaxin Gao, Hyukjin Kwon, Igor Dvorzhak, IonutBoicuAms, Itay Bittan, Ivan Karol, Ivan Sadikov, Jackey Lee, Jerry Peng, Jiaan Geng, Jie, Johan Nystrom, Josh Rosen, Junfan Zhang, Jungtaek Lim, Kamel Gazzaz, Karen Feng, Karthik Subramanian, Kazuyuki Tanimura, Ke Jia, Keith Holliday, Keith Massey, Kent Yao, Kevin Sewell, Kevin Su, Kevin Wallimann, Koert Kuipers, Kousuke Saruta, Kun Wan, Lei Peng, Leona, Leona Yoda, Liang Zhang, Liang-Chi Hsieh, Linhong Liu, Lorenzo Martini, Luca Canali, Ludovic Henry, Lukas Rytz, Luran He, Maciej Szymkiewicz, Manu Zhang, Martin Tzvetanov Grigorov, Maryann Xue, Matthew Jones, Max Gekk, Menelaos Karavelas, Michael Chen, Michał Słapek, Mick Jermsurawong, Microsoft Learn Student, Min Shen, Minchu Yang, Ming Li, Mohamadreza Rostami, Mridul Muralidharan, Nicholas Chammas, Nicolas Azrak, Ole Sasse, Pablo Langa, Parth Chandra, PengLei, Peter Toth, Philipp Dallig, Prashant Singh, Qian.Sun, RabbidHY, Radek Busz, Rahul Mahadev, 
Richard Chen, Rob Reeves, Robert (Bobby) Evans, RoryQi, Rui Wang, Ruifeng Zheng, Russell Spitzer, Sachin Tripathi, Sajith Ariyarathna, Samuel Moseley, Samuel Souza, Sathiya KUMAR, SaurabhChawla, Sean Owen, Senthil Kumar, Serge Rielau, Shardul Mahadik, Shixiong Zhu, Shockang, Shruti Gumma, Simeon Simeonov, Steve Loughran, Steven Aerts, Takuya UESHIN, Ted Yu, Tengfei Huang, Terry Kim, Thejdeep Gudivada, Thomas Graves, Tim Armstrong, Tom van Bussel, Tomas Pereira de Vasconcelos, TongWeii, Utkarsh, Vasily Malakhin, Venkata Sai Akhil Gudesa, Venkata krishnan Sowrirajan, Venki Korukanti, Vitalii Li, Wang, Warren Zhu, Weichen Xu, Weiwei Yang, Wenchen Fan, William Hyun, Wu, Xiaochang, Xianjin YE, Xiduo You, Xingbo Jiang, Xinrong Meng, Xinyi Yu, XiuLi Wei, Yang He, Yang Liu, YangJie, Yannis Sismanis, Ye Zhou, Yesheng Ma, Yihong He, Yikf, Yikun Jiang, Yimin, Yingyi Bu, Yuanjian Li, Yufei Gu, Yuming Wang, Yun Tang, Yuto Akutsu, Zhen Li, Zhenhua Wang, Zimo Li, alexander_holmes, beobest2, bjornjorgensen, chenzhx, copperybean, daugraph, dch nguyen, dchvn, dchvn nguyen, dgd-contributor, dgd_contributor, dohongdayi, erenavsarogullari, fhygh, flynn, gaoyajun02, gengjiaan, herman, hi-zir, huangmaoyang2, huaxingao, hujiahua, jackierwzhang, jackylee-ch, jiaoqb, jinhai, khalidmammadov, kuwii, leesf, mans2singh, mcdull-zhang, michaelzhang-db, minyyy, nyingping, pralabhkumar, qitao liu, remykarem, sandeepvinayak, senthilkumarb, shane knapp, skhandrikagmail, sperlingxx, sudoliyang, sweisdb, sychen, tan.vu, tanel.kiis@gmail.com, tenglei, tianhanhu, tianlzhang, timothy65535, tooptoop4, vadim, w00507315, wangguangxin.cn, wangshengjie3, wayneguow, wooplevip, wuyi, xiepengjie, xuyu, yangjie01, yaohua, yi.wu, yikaifei, yoda-mon, zhangxudong1, zhoubin11, zhouyifan279, zhuqi-lucas, zwangsheng</p>
<p>
<br/>
<a href="/news/">Spark News Archive</a>
</p>
</div>
<div class="col-12 col-md-3">
<div class="news" style="margin-bottom: 20px;">
<h5>Latest News</h5>
<ul class="list-unstyled">
<li><a href="/news/spark-3-4-3-released.html">Spark 3.4.3 released</a>
<span class="small">(Apr 18, 2024)</span></li>
<li><a href="/news/spark-3-5-1-released.html">Spark 3.5.1 released</a>
<span class="small">(Feb 23, 2024)</span></li>
<li><a href="/news/spark-3-3-4-released.html">Spark 3.3.4 released</a>
<span class="small">(Dec 16, 2023)</span></li>
<li><a href="/news/spark-3-4-2-released.html">Spark 3.4.2 released</a>
<span class="small">(Nov 30, 2023)</span></li>
</ul>
<p class="small" style="text-align: right;"><a href="/news/index.html">Archive</a></p>
</div>
<div style="text-align:center; margin-bottom: 20px;">
<a href="https://www.apache.org/events/current-event.html">
<img src="https://www.apache.org/events/current-event-234x60.png" alt="Current Apache event" style="max-width: 100%;"/>
</a>
</div>
<div class="hidden-xs hidden-sm">
<a href="/downloads.html" class="btn btn-cta btn-lg d-grid" style="margin-bottom: 30px;">
Download Spark
</a>
<p style="font-size: 16px; font-weight: 500; color: #555;">
Built-in Libraries:
</p>
<ul class="list-none">
<li><a href="/sql/">SQL and DataFrames</a></li>
<li><a href="/streaming/">Spark Streaming</a></li>
<li><a href="/mllib/">MLlib (machine learning)</a></li>
<li><a href="/graphx/">GraphX (graph)</a></li>
</ul>
<a href="/third-party-projects.html">Third-Party Projects</a>
</div>
</div>
</div>
<footer class="small">
<hr>
Apache Spark, Spark, Apache, the Apache feather logo, and the Apache Spark project logo are either registered
trademarks or trademarks of The Apache Software Foundation in the United States and other countries.
See guidance on use of Apache Spark <a href="/trademarks.html">trademarks</a>.
All other marks mentioned may be trademarks or registered trademarks of their respective owners.
Copyright &copy; 2018 The Apache Software Foundation, Licensed under the
<a href="https://www.apache.org/licenses/">Apache License, Version 2.0</a>.
</footer>
</div>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/js/bootstrap.bundle.min.js"
integrity="sha384-MrcW6ZMFYlzcLA8Nl+NtUVF0sA7MsXsP1UyJoMp4YLEuNSfAP+JcXn/tWtIaxVXM"
crossorigin="anonymous"></script>
<script src="https://code.jquery.com/jquery.js"></script>
<script src="/js/lang-tabs.js"></script>
<script src="/js/downloads.js"></script>
</body>
</html>