<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<title>Migration Guide: SQL, Datasets and DataFrame - Spark 3.2.4 Documentation</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<style>
body {
padding-top: 60px;
padding-bottom: 40px;
}
</style>
<meta name="viewport" content="width=device-width">
<link rel="stylesheet" href="css/main.css">
<script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
<link rel="stylesheet" href="css/pygments-default.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css" />
<link rel="stylesheet" href="css/docsearch.css">
<!-- Matomo -->
<script type="text/javascript">
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body>
<!--[if lt IE 7]>
<p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
<![endif]-->
<!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
<nav class="navbar fixed-top navbar-expand-md navbar-light bg-light" id="topbar">
<div class="container">
<div class="navbar-header">
<div class="navbar-brand"><a href="index.html">
<img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">3.2.4</span>
</div>
</div>
<button class="navbar-toggler" type="button" data-toggle="collapse"
data-target="#navbarCollapse" aria-controls="navbarCollapse"
aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarCollapse">
<ul class="navbar-nav">
<!--TODO(andyk): Add class="active" attribute to li some how.-->
<li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a>
<div class="dropdown-menu" aria-labelledby="navbarQuickStart">
<a class="dropdown-item" href="quick-start.html">Quick Start</a>
<a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a>
<a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a>
<a class="dropdown-item" href="structured-streaming-programming-guide.html">Structured Streaming</a>
<a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a>
<a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a>
<a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a>
<a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a>
<a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a>
<div class="dropdown-menu" aria-labelledby="navbarAPIDocs">
<a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a>
<a class="dropdown-item" href="api/java/index.html">Java</a>
<a class="dropdown-item" href="api/python/index.html">Python</a>
<a class="dropdown-item" href="api/R/index.html">R</a>
<a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a>
<div class="dropdown-menu" aria-labelledby="navbarDeploying">
<a class="dropdown-item" href="cluster-overview.html">Overview</a>
<a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a>
<a class="dropdown-item" href="running-on-mesos.html">Mesos</a>
<a class="dropdown-item" href="running-on-yarn.html">YARN</a>
<a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
<div class="dropdown-menu" aria-labelledby="navbarMore">
<a class="dropdown-item" href="configuration.html">Configuration</a>
<a class="dropdown-item" href="monitoring.html">Monitoring</a>
<a class="dropdown-item" href="tuning.html">Tuning Guide</a>
<a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a>
<a class="dropdown-item" href="security.html">Security</a>
<a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a>
<a class="dropdown-item" href="migration-guide.html">Migration Guide</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="building-spark.html">Building Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a>
</div>
</li>
<li class="nav-item">
<input type="text" id="docsearch-input" placeholder="Search the docs…">
</li>
</ul>
<!--<span class="navbar-text navbar-right"><span class="version-text">v3.2.4</span></span>-->
</div>
</div>
</nav>
<div class="container-wrapper">
<div class="left-menu-wrapper">
<div class="left-menu">
<h3><a href="migration-guide.html">Migration Guide</a></h3>
<ul>
<li>
<a href="core-migration-guide.html">
Spark Core
</a>
</li>
<li>
<a href="sql-migration-guide.html">
<b>SQL, Datasets and DataFrame</b>
</a>
</li>
<li>
<a href="ss-migration-guide.html">
Structured Streaming
</a>
</li>
<li>
<a href="ml-migration-guide.html">
MLlib (Machine Learning)
</a>
</li>
<li>
<a href="pyspark-migration-guide.html">
PySpark (Python on Spark)
</a>
</li>
<li>
<a href="sparkr-migration-guide.html">
SparkR (R on Spark)
</a>
</li>
</ul>
</div>
</div>
<input id="nav-trigger" class="nav-trigger" checked type="checkbox">
<label for="nav-trigger"></label>
<div class="content-with-sidebar mr-3" id="content">
<h1 class="title">Migration Guide: SQL, Datasets and DataFrame</h1>
<ul id="markdown-toc">
<li><a href="#upgrading-from-spark-sql-31-to-32" id="markdown-toc-upgrading-from-spark-sql-31-to-32">Upgrading from Spark SQL 3.1 to 3.2</a></li>
<li><a href="#upgrading-from-spark-sql-30-to-31" id="markdown-toc-upgrading-from-spark-sql-30-to-31">Upgrading from Spark SQL 3.0 to 3.1</a></li>
<li><a href="#upgrading-from-spark-sql-301-to-302" id="markdown-toc-upgrading-from-spark-sql-301-to-302">Upgrading from Spark SQL 3.0.1 to 3.0.2</a></li>
<li><a href="#upgrading-from-spark-sql-30-to-301" id="markdown-toc-upgrading-from-spark-sql-30-to-301">Upgrading from Spark SQL 3.0 to 3.0.1</a></li>
<li><a href="#upgrading-from-spark-sql-24-to-30" id="markdown-toc-upgrading-from-spark-sql-24-to-30">Upgrading from Spark SQL 2.4 to 3.0</a> <ul>
<li><a href="#datasetdataframe-apis" id="markdown-toc-datasetdataframe-apis">Dataset/DataFrame APIs</a></li>
<li><a href="#ddl-statements" id="markdown-toc-ddl-statements">DDL Statements</a></li>
<li><a href="#udfs-and-built-in-functions" id="markdown-toc-udfs-and-built-in-functions">UDFs and Built-in Functions</a></li>
<li><a href="#query-engine" id="markdown-toc-query-engine">Query Engine</a></li>
<li><a href="#data-sources" id="markdown-toc-data-sources">Data Sources</a></li>
<li><a href="#others" id="markdown-toc-others">Others</a></li>
</ul>
</li>
<li><a href="#upgrading-from-spark-sql-247-to-248" id="markdown-toc-upgrading-from-spark-sql-247-to-248">Upgrading from Spark SQL 2.4.7 to 2.4.8</a></li>
<li><a href="#upgrading-from-spark-sql-245-to-246" id="markdown-toc-upgrading-from-spark-sql-245-to-246">Upgrading from Spark SQL 2.4.5 to 2.4.6</a></li>
<li><a href="#upgrading-from-spark-sql-244-to-245" id="markdown-toc-upgrading-from-spark-sql-244-to-245">Upgrading from Spark SQL 2.4.4 to 2.4.5</a></li>
<li><a href="#upgrading-from-spark-sql-243-to-244" id="markdown-toc-upgrading-from-spark-sql-243-to-244">Upgrading from Spark SQL 2.4.3 to 2.4.4</a></li>
<li><a href="#upgrading-from-spark-sql-24-to-241" id="markdown-toc-upgrading-from-spark-sql-24-to-241">Upgrading from Spark SQL 2.4 to 2.4.1</a></li>
<li><a href="#upgrading-from-spark-sql-23-to-24" id="markdown-toc-upgrading-from-spark-sql-23-to-24">Upgrading from Spark SQL 2.3 to 2.4</a></li>
<li><a href="#upgrading-from-spark-sql-22-to-23" id="markdown-toc-upgrading-from-spark-sql-22-to-23">Upgrading from Spark SQL 2.2 to 2.3</a></li>
<li><a href="#upgrading-from-spark-sql-21-to-22" id="markdown-toc-upgrading-from-spark-sql-21-to-22">Upgrading from Spark SQL 2.1 to 2.2</a></li>
<li><a href="#upgrading-from-spark-sql-20-to-21" id="markdown-toc-upgrading-from-spark-sql-20-to-21">Upgrading from Spark SQL 2.0 to 2.1</a></li>
<li><a href="#upgrading-from-spark-sql-16-to-20" id="markdown-toc-upgrading-from-spark-sql-16-to-20">Upgrading from Spark SQL 1.6 to 2.0</a></li>
<li><a href="#upgrading-from-spark-sql-15-to-16" id="markdown-toc-upgrading-from-spark-sql-15-to-16">Upgrading from Spark SQL 1.5 to 1.6</a></li>
<li><a href="#upgrading-from-spark-sql-14-to-15" id="markdown-toc-upgrading-from-spark-sql-14-to-15">Upgrading from Spark SQL 1.4 to 1.5</a></li>
<li><a href="#upgrading-from-spark-sql-13-to-14" id="markdown-toc-upgrading-from-spark-sql-13-to-14">Upgrading from Spark SQL 1.3 to 1.4</a></li>
<li><a href="#upgrading-from-spark-sql-10-12-to-13" id="markdown-toc-upgrading-from-spark-sql-10-12-to-13">Upgrading from Spark SQL 1.0-1.2 to 1.3</a></li>
<li><a href="#compatibility-with-apache-hive" id="markdown-toc-compatibility-with-apache-hive">Compatibility with Apache Hive</a></li>
</ul>
<h2 id="upgrading-from-spark-sql-31-to-32">Upgrading from Spark SQL 3.1 to 3.2</h2>
<ul>
<li>
<p>Since Spark 3.2, ADD FILE/JAR/ARCHIVE commands require each path to be enclosed by <code class="language-plaintext highlighter-rouge">"</code> or <code class="language-plaintext highlighter-rouge">'</code> if the path contains whitespace.</p>
</li>
<li>
<p>Since Spark 3.2, all the supported JDBC dialects use StringType for ROWID. In Spark 3.1 or earlier, Oracle dialect uses StringType and the other dialects use LongType.</p>
</li>
<li>
<p>In Spark 3.2, the PostgreSQL JDBC dialect uses StringType for MONEY, and MONEY[] is not supported because the JDBC driver for PostgreSQL can&#8217;t handle those types properly. In Spark 3.1 or earlier, DoubleType and ArrayType of DoubleType are used respectively.</p>
</li>
<li>
<p>In Spark 3.2, <code class="language-plaintext highlighter-rouge">spark.sql.adaptive.enabled</code> is enabled by default. To restore the behavior before Spark 3.2, you can set <code class="language-plaintext highlighter-rouge">spark.sql.adaptive.enabled</code> to <code class="language-plaintext highlighter-rouge">false</code>.</p>
</li>
<li>In Spark 3.2, the following meta-characters are escaped in the <code class="language-plaintext highlighter-rouge">show()</code> action. In Spark 3.1 or earlier, these meta-characters are output as-is.
<ul>
<li><code class="language-plaintext highlighter-rouge">\n</code> (new line)</li>
<li><code class="language-plaintext highlighter-rouge">\r</code> (carrige ret)</li>
<li><code class="language-plaintext highlighter-rouge">\t</code> (horizontal tab)</li>
<li><code class="language-plaintext highlighter-rouge">\f</code> (form feed)</li>
<li><code class="language-plaintext highlighter-rouge">\b</code> (backspace)</li>
<li><code class="language-plaintext highlighter-rouge">\u000B</code> (vertical tab)</li>
<li><code class="language-plaintext highlighter-rouge">\u0007</code> (bell)</li>
</ul>
</li>
<li>
<p>In Spark 3.2, <code class="language-plaintext highlighter-rouge">ALTER TABLE .. RENAME TO PARTITION</code> throws <code class="language-plaintext highlighter-rouge">PartitionAlreadyExistsException</code> instead of <code class="language-plaintext highlighter-rouge">AnalysisException</code> for tables from Hive external when the target partition already exists.</p>
</li>
<li>
<p>In Spark 3.2, the script transform default FIELD DELIMIT is <code class="language-plaintext highlighter-rouge">\u0001</code> in no-serde mode, and the serde property <code class="language-plaintext highlighter-rouge">field.delim</code> is <code class="language-plaintext highlighter-rouge">\t</code> in Hive serde mode when the user specifies a serde. In Spark 3.1 or earlier, the default FIELD DELIMIT is <code class="language-plaintext highlighter-rouge">\t</code>, and the serde property <code class="language-plaintext highlighter-rouge">field.delim</code> is <code class="language-plaintext highlighter-rouge">\u0001</code> in Hive serde mode when the user specifies a serde.</p>
</li>
<li>
<p>In Spark 3.2, the auto-generated <code class="language-plaintext highlighter-rouge">Cast</code> (such as those added by type coercion rules) will be stripped when generating column alias names. E.g., <code class="language-plaintext highlighter-rouge">sql("SELECT floor(1)").columns</code> will be <code class="language-plaintext highlighter-rouge">FLOOR(1)</code> instead of <code class="language-plaintext highlighter-rouge">FLOOR(CAST(1 AS DOUBLE))</code>.</p>
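<p>A quick way to see the difference (assuming an active <code>SparkSession</code> named <code>spark</code>):</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Spark 3.2: the coercion-added cast no longer appears in the generated alias.
spark.sql("SELECT floor(1)").columns   // Array(FLOOR(1))
// Spark 3.1 and earlier produced Array(FLOOR(CAST(1 AS DOUBLE))).
</code></pre></div></div>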
</li>
<li>
<p>In Spark 3.2, the output schema of <code class="language-plaintext highlighter-rouge">SHOW TABLES</code> becomes <code class="language-plaintext highlighter-rouge">namespace: string, tableName: string, isTemporary: boolean</code>. In Spark 3.1 or earlier, the <code class="language-plaintext highlighter-rouge">namespace</code> field was named <code class="language-plaintext highlighter-rouge">database</code> for the builtin catalog, and there is no <code class="language-plaintext highlighter-rouge">isTemporary</code> field for v2 catalogs. To restore the old schema with the builtin catalog, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.keepCommandOutputSchema</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.2, the output schema of <code class="language-plaintext highlighter-rouge">SHOW TABLE EXTENDED</code> becomes <code class="language-plaintext highlighter-rouge">namespace: string, tableName: string, isTemporary: boolean, information: string</code>. In Spark 3.1 or earlier, the <code class="language-plaintext highlighter-rouge">namespace</code> field was named <code class="language-plaintext highlighter-rouge">database</code> for the builtin catalog, and no change for the v2 catalogs. To restore the old schema with the builtin catalog, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.keepCommandOutputSchema</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.2, the output schema of <code class="language-plaintext highlighter-rouge">SHOW TBLPROPERTIES</code> becomes <code class="language-plaintext highlighter-rouge">key: string, value: string</code> whether you specify the table property key or not. In Spark 3.1 and earlier, the output schema of <code class="language-plaintext highlighter-rouge">SHOW TBLPROPERTIES</code> is <code class="language-plaintext highlighter-rouge">value: string</code> when you specify the table property key. To restore the old schema with the builtin catalog, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.keepCommandOutputSchema</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.2, the output schema of <code class="language-plaintext highlighter-rouge">DESCRIBE NAMESPACE</code> becomes <code class="language-plaintext highlighter-rouge">info_name: string, info_value: string</code>. In Spark 3.1 or earlier, the <code class="language-plaintext highlighter-rouge">info_name</code> field was named <code class="language-plaintext highlighter-rouge">database_description_item</code> and the <code class="language-plaintext highlighter-rouge">info_value</code> field was named <code class="language-plaintext highlighter-rouge">database_description_value</code> for the builtin catalog. To restore the old schema with the builtin catalog, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.keepCommandOutputSchema</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>In Spark 3.2, table refreshing clears cached data of the table as well as of all its dependents such as views while keeping the dependents cached. The following commands perform table refreshing:
<ul>
<li><code class="language-plaintext highlighter-rouge">ALTER TABLE .. ADD PARTITION</code></li>
<li><code class="language-plaintext highlighter-rouge">ALTER TABLE .. RENAME PARTITION</code></li>
<li><code class="language-plaintext highlighter-rouge">ALTER TABLE .. DROP PARTITION</code></li>
<li><code class="language-plaintext highlighter-rouge">ALTER TABLE .. RECOVER PARTITIONS</code></li>
<li><code class="language-plaintext highlighter-rouge">MSCK REPAIR TABLE</code></li>
<li><code class="language-plaintext highlighter-rouge">LOAD DATA</code></li>
<li><code class="language-plaintext highlighter-rouge">REFRESH TABLE</code></li>
<li><code class="language-plaintext highlighter-rouge">TRUNCATE TABLE</code></li>
<li>and the method <code class="language-plaintext highlighter-rouge">spark.catalog.refreshTable</code>.</li>
</ul>
<p>In Spark 3.1 and earlier, table refreshing leaves dependents uncached.</p>
</li>
<li>
<p>In Spark 3.2, the usage of <code class="language-plaintext highlighter-rouge">count(tblName.*)</code> is blocked to avoid producing ambiguous results, because <code class="language-plaintext highlighter-rouge">count(*)</code> and <code class="language-plaintext highlighter-rouge">count(tblName.*)</code> output different results if there are any null values. To restore the behavior before Spark 3.2, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.allowStarWithSingleTableIdentifierInCount</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
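<p>A sketch of possible rewrites, using a hypothetical temporary view named <code>tbl</code> (assuming an active <code>SparkSession</code> named <code>spark</code>):</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>spark.range(3).toDF("id").createOrReplaceTempView("tbl")
// Spark 3.2: blocked, because count(*) and count(tbl.*) can differ when nulls are present.
// spark.sql("SELECT count(tbl.*) FROM tbl")          // AnalysisException
// Unambiguous alternatives:
spark.sql("SELECT count(*) FROM tbl").show()          // counts all rows
spark.sql("SELECT count(tbl.id) FROM tbl").show()     // counts non-null values of a column
</code></pre></div></div>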
</li>
<li>
<p>In Spark 3.2, we support typed literals in the partition spec of INSERT and ADD/DROP/RENAME PARTITION. For example, <code class="language-plaintext highlighter-rouge">ADD PARTITION(dt = date'2020-01-01')</code> adds a partition with the date value <code class="language-plaintext highlighter-rouge">2020-01-01</code>. In Spark 3.1 and earlier, the partition value is parsed as the string value <code class="language-plaintext highlighter-rouge">date '2020-01-01'</code>, which is an illegal date value, and a partition with a null value is added in the end.</p>
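<p>For illustration, with a hypothetical partitioned table named <code>events</code> (assuming an active <code>SparkSession</code> named <code>spark</code>):</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>spark.sql("CREATE TABLE events (value INT, dt DATE) USING parquet PARTITIONED BY (dt)")
// Spark 3.2: the typed literal is evaluated, so this adds the partition dt=2020-01-01.
spark.sql("ALTER TABLE events ADD PARTITION (dt = date'2020-01-01')")
spark.sql("SHOW PARTITIONS events").show(false)
</code></pre></div></div>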
</li>
<li>
<p>In Spark 3.2, <code class="language-plaintext highlighter-rouge">DataFrameNaFunctions.replace()</code> no longer uses exact string match for the input column names, to match the SQL syntax and support qualified column names. Input column name having a dot in the name (not nested) needs to be escaped with backtick `. Now, it throws <code class="language-plaintext highlighter-rouge">AnalysisException</code> if the column is not found in the data frame schema. It also throws <code class="language-plaintext highlighter-rouge">IllegalArgumentException</code> if the input column name is a nested column. In Spark 3.1 and earlier, it used to ignore invalid input column name and nested column name.</p>
</li>
<li>
<p>In Spark 3.2, the dates subtraction expression such as <code class="language-plaintext highlighter-rouge">date1 - date2</code> returns values of <code class="language-plaintext highlighter-rouge">DayTimeIntervalType</code>. In Spark 3.1 and earlier, the returned type is <code class="language-plaintext highlighter-rouge">CalendarIntervalType</code>. To restore the behavior before Spark 3.2, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.interval.enabled</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.2, the timestamps subtraction expression such as <code class="language-plaintext highlighter-rouge">timestamp '2021-03-31 23:48:00' - timestamp '2021-01-01 00:00:00'</code> returns values of <code class="language-plaintext highlighter-rouge">DayTimeIntervalType</code>. In Spark 3.1 and earlier, the type of the same expression is <code class="language-plaintext highlighter-rouge">CalendarIntervalType</code>. To restore the behavior before Spark 3.2, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.interval.enabled</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
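<p>For example (assuming an active <code>SparkSession</code> named <code>spark</code>):</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Spark 3.2: the difference is an ANSI DayTimeIntervalType value.
val diff = spark.sql("SELECT timestamp'2021-03-31 23:48:00' - timestamp'2021-01-01 00:00:00' AS diff")
diff.printSchema()   // diff: interval day to second
// Setting spark.sql.legacy.interval.enabled=true brings back CalendarIntervalType.
</code></pre></div></div>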
</li>
<li>
<p>In Spark 3.2, <code class="language-plaintext highlighter-rouge">CREATE TABLE .. LIKE ..</code> command can not use reserved properties. You need their specific clauses to specify them, for example, <code class="language-plaintext highlighter-rouge">CREATE TABLE test1 LIKE test LOCATION 'some path'</code>. You can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.notReserveProperties</code> to <code class="language-plaintext highlighter-rouge">true</code> to ignore the <code class="language-plaintext highlighter-rouge">ParseException</code>, in this case, these properties will be silently removed, for example: <code class="language-plaintext highlighter-rouge">TBLPROPERTIES('owner'='yao')</code> will have no effect. In Spark version 3.1 and below, the reserved properties can be used in <code class="language-plaintext highlighter-rouge">CREATE TABLE .. LIKE ..</code> command but have no side effects, for example, <code class="language-plaintext highlighter-rouge">TBLPROPERTIES('location'='/tmp')</code> does not change the location of the table but only create a headless property just like <code class="language-plaintext highlighter-rouge">'a'='b'</code>.</p>
</li>
<li>
<p>In Spark 3.2, <code class="language-plaintext highlighter-rouge">TRANSFORM</code> operator can&#8217;t support alias in inputs. In Spark 3.1 and earlier, we can write script transform like <code class="language-plaintext highlighter-rouge">SELECT TRANSFORM(a AS c1, b AS c2) USING 'cat' FROM TBL</code>.</p>
</li>
<li>
<p>In Spark 3.2, <code class="language-plaintext highlighter-rouge">TRANSFORM</code> operator can support <code class="language-plaintext highlighter-rouge">ArrayType/MapType/StructType</code> without Hive SerDe, in this mode, we use <code class="language-plaintext highlighter-rouge">StructsToJosn</code> to convert <code class="language-plaintext highlighter-rouge">ArrayType/MapType/StructType</code> column to <code class="language-plaintext highlighter-rouge">STRING</code> and use <code class="language-plaintext highlighter-rouge">JsonToStructs</code> to parse <code class="language-plaintext highlighter-rouge">STRING</code> to <code class="language-plaintext highlighter-rouge">ArrayType/MapType/StructType</code>. In Spark 3.1, Spark just support case <code class="language-plaintext highlighter-rouge">ArrayType/MapType/StructType</code> column as <code class="language-plaintext highlighter-rouge">STRING</code> but can&#8217;t support parse <code class="language-plaintext highlighter-rouge">STRING</code> to <code class="language-plaintext highlighter-rouge">ArrayType/MapType/StructType</code> output columns.</p>
</li>
<li>
<p>In Spark 3.2, the unit-to-unit interval literals like <code class="language-plaintext highlighter-rouge">INTERVAL '1-1' YEAR TO MONTH</code> and the unit list interval literals like <code class="language-plaintext highlighter-rouge">INTERVAL '3' DAYS '1' HOUR</code> are converted to ANSI interval types: <code class="language-plaintext highlighter-rouge">YearMonthIntervalType</code> or <code class="language-plaintext highlighter-rouge">DayTimeIntervalType</code>. In Spark 3.1 and earlier, such interval literals are converted to <code class="language-plaintext highlighter-rouge">CalendarIntervalType</code>. To restore the behavior before Spark 3.2, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.interval.enabled</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.2, the unit list interval literals can not mix year-month fields (YEAR and MONTH) and day-time fields (WEEK, DAY, &#8230;, MICROSECOND). For example, <code class="language-plaintext highlighter-rouge">INTERVAL 1 month 1 hour</code> is invalid in Spark 3.2. In Spark 3.1 and earlier, there is no such limitation and the literal returns value of <code class="language-plaintext highlighter-rouge">CalendarIntervalType</code>. To restore the behavior before Spark 3.2, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.interval.enabled</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.2, Spark supports <code class="language-plaintext highlighter-rouge">DayTimeIntervalType</code> and <code class="language-plaintext highlighter-rouge">YearMonthIntervalType</code> as inputs and outputs of the <code class="language-plaintext highlighter-rouge">TRANSFORM</code> clause in Hive <code class="language-plaintext highlighter-rouge">SERDE</code> mode. The behavior differs between Hive <code class="language-plaintext highlighter-rouge">SERDE</code> mode and <code class="language-plaintext highlighter-rouge">ROW FORMAT DELIMITED</code> mode when these two types are used as inputs. In Hive <code class="language-plaintext highlighter-rouge">SERDE</code> mode, a <code class="language-plaintext highlighter-rouge">DayTimeIntervalType</code> column is converted to <code class="language-plaintext highlighter-rouge">HiveIntervalDayTime</code> and its string format is <code class="language-plaintext highlighter-rouge">[-]?d h:m:s.n</code>, but in <code class="language-plaintext highlighter-rouge">ROW FORMAT DELIMITED</code> mode the format is <code class="language-plaintext highlighter-rouge">INTERVAL '[-]?d h:m:s.n' DAY TO TIME</code>. In Hive <code class="language-plaintext highlighter-rouge">SERDE</code> mode, a <code class="language-plaintext highlighter-rouge">YearMonthIntervalType</code> column is converted to <code class="language-plaintext highlighter-rouge">HiveIntervalYearMonth</code> and its string format is <code class="language-plaintext highlighter-rouge">[-]?y-m</code>, but in <code class="language-plaintext highlighter-rouge">ROW FORMAT DELIMITED</code> mode the format is <code class="language-plaintext highlighter-rouge">INTERVAL '[-]?y-m' YEAR TO MONTH</code>.</p>
</li>
<li>
<p>In Spark 3.2, <code class="language-plaintext highlighter-rouge">hash(0) == hash(-0)</code> for floating point types. Previously, different values were generated.</p>
</li>
<li>
<p>In Spark 3.2, <code class="language-plaintext highlighter-rouge">CREATE TABLE AS SELECT</code> with non-empty <code class="language-plaintext highlighter-rouge">LOCATION</code> will throw <code class="language-plaintext highlighter-rouge">AnalysisException</code>. To restore the behavior before Spark 3.2, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.allowNonEmptyLocationInCTAS</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.2, special datetime values such as <code class="language-plaintext highlighter-rouge">epoch</code>, <code class="language-plaintext highlighter-rouge">today</code>, <code class="language-plaintext highlighter-rouge">yesterday</code>, <code class="language-plaintext highlighter-rouge">tomorrow</code>, and <code class="language-plaintext highlighter-rouge">now</code> are supported in typed literals or in cast of foldable strings only, for instance, <code class="language-plaintext highlighter-rouge">select timestamp'now'</code> or <code class="language-plaintext highlighter-rouge">select cast('today' as date)</code>. In Spark 3.1 and 3.0, such special values are supported in any casts of strings to dates/timestamps. To keep these special values as dates/timestamps in Spark 3.1 and 3.0, you should replace them manually, e.g. <code class="language-plaintext highlighter-rouge">if (c in ('now', 'today'), current_date(), cast(c as date))</code>.</p>
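<p>A sketch of the new restriction and a manual replacement (assuming an active <code>SparkSession</code> named <code>spark</code>):</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Spark 3.2: special strings are only recognized in typed literals and casts of foldable strings.
spark.sql("SELECT timestamp'now' AS ts, cast('today' AS date) AS d").show()
// For string columns, replace the special values explicitly:
spark.sql("""
  SELECT if(c IN ('now', 'today'), current_date(), cast(c AS date)) AS d
  FROM VALUES ('today'), ('2021-01-01') AS t(c)
""").show()
</code></pre></div></div>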
</li>
<li>
<p>In Spark 3.2, <code class="language-plaintext highlighter-rouge">FloatType</code> is mapped to <code class="language-plaintext highlighter-rouge">FLOAT</code> in MySQL. Prior to this, it used to be mapped to <code class="language-plaintext highlighter-rouge">REAL</code>, which is by default a synonym to <code class="language-plaintext highlighter-rouge">DOUBLE PRECISION</code> in MySQL.</p>
</li>
<li>
<p>In Spark 3.2, the query executions triggered by <code class="language-plaintext highlighter-rouge">DataFrameWriter</code> are always named <code class="language-plaintext highlighter-rouge">command</code> when being sent to <code class="language-plaintext highlighter-rouge">QueryExecutionListener</code>. In Spark 3.1 and earlier, the name is one of <code class="language-plaintext highlighter-rouge">save</code>, <code class="language-plaintext highlighter-rouge">insertInto</code>, <code class="language-plaintext highlighter-rouge">saveAsTable</code>.</p>
</li>
<li>
<p>In Spark 3.2, <code class="language-plaintext highlighter-rouge">Dataset.unionByName</code> with <code class="language-plaintext highlighter-rouge">allowMissingColumns</code> set to true will add missing nested fields to the end of structs. In Spark 3.1, nested struct fields are sorted alphabetically.</p>
</li>
<li>
<p>In Spark 3.2, create/alter view will fail if the input query output columns contain an auto-generated alias. This is necessary to make sure the query output column names are stable across different Spark versions. To restore the behavior before Spark 3.2, set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.allowAutoGeneratedAliasForView</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.2, date +/- interval with only day-time fields such as <code class="language-plaintext highlighter-rouge">date '2011-11-11' + interval 12 hours</code> returns timestamp. In Spark 3.1 and earlier, the same expression returns date. To restore the behavior before Spark 3.2, you can use <code class="language-plaintext highlighter-rouge">cast</code> to convert timestamp as date.</p>
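<p>A short illustration of the new result type and the explicit cast (assuming an active <code>SparkSession</code> named <code>spark</code>):</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Spark 3.2: adding a day-time interval to a date yields a timestamp.
spark.sql("SELECT date'2011-11-11' + interval 12 hours AS ts").printSchema()
// Cast back explicitly if the old date result is wanted:
spark.sql("SELECT cast(date'2011-11-11' + interval 12 hours AS date) AS d").printSchema()
</code></pre></div></div>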
</li>
<li>Since Spark 3.2.3, for <code class="language-plaintext highlighter-rouge">SELECT ... GROUP BY a GROUPING SETS (b)</code>-style SQL statements, <code class="language-plaintext highlighter-rouge">grouping__id</code> returns different values from Apache Spark 3.2.0, 3.2.1, and 3.2.2. It is computed based on the user-given group-by expressions plus the grouping set columns. To restore the behavior before 3.2.3, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.groupingIdWithAppendedUserGroupBy</code> to <code class="language-plaintext highlighter-rouge">true</code>. For details, see <a href="https://issues.apache.org/jira/browse/SPARK-40218">SPARK-40218</a> and <a href="https://issues.apache.org/jira/browse/SPARK-40562">SPARK-40562</a>.</li>
</ul>
<h2 id="upgrading-from-spark-sql-30-to-31">Upgrading from Spark SQL 3.0 to 3.1</h2>
<ul>
<li>
<p>In Spark 3.1, the statistical aggregation functions <code class="language-plaintext highlighter-rouge">std</code>, <code class="language-plaintext highlighter-rouge">stddev</code>, <code class="language-plaintext highlighter-rouge">stddev_samp</code>, <code class="language-plaintext highlighter-rouge">variance</code>, <code class="language-plaintext highlighter-rouge">var_samp</code>, <code class="language-plaintext highlighter-rouge">skewness</code>, <code class="language-plaintext highlighter-rouge">kurtosis</code>, <code class="language-plaintext highlighter-rouge">covar_samp</code>, and <code class="language-plaintext highlighter-rouge">corr</code> return <code class="language-plaintext highlighter-rouge">NULL</code> instead of <code class="language-plaintext highlighter-rouge">Double.NaN</code> when <code class="language-plaintext highlighter-rouge">DivideByZero</code> occurs during expression evaluation, for example, when <code class="language-plaintext highlighter-rouge">stddev_samp</code> is applied on a single element set. In Spark version 3.0 and earlier, they return <code class="language-plaintext highlighter-rouge">Double.NaN</code> in such cases. To restore the behavior before Spark 3.1, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.statisticalAggregate</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
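<p>A minimal sketch of the new result and the legacy flag (assuming an active <code>SparkSession</code> named <code>spark</code>):</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Spark 3.1: a sample standard deviation over a single element is NULL (NaN in 3.0 and earlier).
spark.sql("SELECT stddev_samp(col) FROM VALUES (1.0) AS t(col)").show()
// Restore the old Double.NaN result if needed:
spark.conf.set("spark.sql.legacy.statisticalAggregate", "true")
</code></pre></div></div>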
</li>
<li>
<p>In Spark 3.1, grouping_id() returns long values. In Spark version 3.0 and earlier, this function returns int values. To restore the behavior before Spark 3.1, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.integerGroupingId</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.1, SQL UI data adopts the <code class="language-plaintext highlighter-rouge">formatted</code> mode for the query plan explain results. To restore the behavior before Spark 3.1, you can set <code class="language-plaintext highlighter-rouge">spark.sql.ui.explainMode</code> to <code class="language-plaintext highlighter-rouge">extended</code>.</p>
</li>
<li>
<p>In Spark 3.1, <code class="language-plaintext highlighter-rouge">from_unixtime</code>, <code class="language-plaintext highlighter-rouge">unix_timestamp</code>,<code class="language-plaintext highlighter-rouge">to_unix_timestamp</code>, <code class="language-plaintext highlighter-rouge">to_timestamp</code> and <code class="language-plaintext highlighter-rouge">to_date</code> will fail if the specified datetime pattern is invalid. In Spark 3.0 or earlier, they result <code class="language-plaintext highlighter-rouge">NULL</code>.</p>
</li>
<li>
<p>In Spark 3.1, the Parquet, ORC, Avro and JSON datasources throw the exception <code class="language-plaintext highlighter-rouge">org.apache.spark.sql.AnalysisException: Found duplicate column(s) in the data schema</code> in read if they detect duplicate names in top-level columns as well as in nested structures. The datasources take into account the SQL config <code class="language-plaintext highlighter-rouge">spark.sql.caseSensitive</code> while detecting column name duplicates.</p>
</li>
<li>
<p>In Spark 3.1, structs and maps are wrapped by the <code class="language-plaintext highlighter-rouge">{}</code> brackets in casting them to strings. For instance, the <code class="language-plaintext highlighter-rouge">show()</code> action and the <code class="language-plaintext highlighter-rouge">CAST</code> expression use such brackets. In Spark 3.0 and earlier, the <code class="language-plaintext highlighter-rouge">[]</code> brackets are used for the same purpose. To restore the behavior before Spark 3.1, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.castComplexTypesToString.enabled</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.1, NULL elements of structures, arrays and maps are converted to &#8220;null&#8221; in casting them to strings. In Spark 3.0 or earlier, NULL elements are converted to empty strings. To restore the behavior before Spark 3.1, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.castComplexTypesToString.enabled</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.1, when <code class="language-plaintext highlighter-rouge">spark.sql.ansi.enabled</code> is false, Spark always returns null if the sum of a decimal type column overflows. In Spark 3.0 or earlier, in this case, the sum of a decimal type column may return null or an incorrect result, or even fail at runtime (depending on the actual query plan execution).</p>
</li>
<li>
<p>In Spark 3.1, <code class="language-plaintext highlighter-rouge">path</code> option cannot coexist when the following methods are called with path parameter(s): <code class="language-plaintext highlighter-rouge">DataFrameReader.load()</code>, <code class="language-plaintext highlighter-rouge">DataFrameWriter.save()</code>, <code class="language-plaintext highlighter-rouge">DataStreamReader.load()</code>, or <code class="language-plaintext highlighter-rouge">DataStreamWriter.start()</code>. In addition, <code class="language-plaintext highlighter-rouge">paths</code> option cannot coexist for <code class="language-plaintext highlighter-rouge">DataFrameReader.load()</code>. For example, <code class="language-plaintext highlighter-rouge">spark.read.format("csv").option("path", "/tmp").load("/tmp2")</code> or <code class="language-plaintext highlighter-rouge">spark.read.option("path", "/tmp").csv("/tmp2")</code> will throw <code class="language-plaintext highlighter-rouge">org.apache.spark.sql.AnalysisException</code>. In Spark version 3.0 and below, <code class="language-plaintext highlighter-rouge">path</code> option is overwritten if one path parameter is passed to above methods; <code class="language-plaintext highlighter-rouge">path</code> option is added to the overall paths if multiple path parameters are passed to <code class="language-plaintext highlighter-rouge">DataFrameReader.load()</code>. To restore the behavior before Spark 3.1, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.pathOptionBehavior.enabled</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.1, <code class="language-plaintext highlighter-rouge">IllegalArgumentException</code> is returned for the incomplete interval literals, e.g. <code class="language-plaintext highlighter-rouge">INTERVAL '1'</code>, <code class="language-plaintext highlighter-rouge">INTERVAL '1 DAY 2'</code>, which are invalid. In Spark 3.0, these literals result in <code class="language-plaintext highlighter-rouge">NULL</code>s.</p>
</li>
<li>
<p>In Spark 3.1, we remove the built-in Hive 1.2. You need to migrate your custom SerDes to Hive 2.3. See <a href="https://issues.apache.org/jira/browse/HIVE-15167">HIVE-15167</a> for more details.</p>
</li>
<li>
<p>In Spark 3.1, loading and saving of timestamps from/to parquet files fails if the timestamps are before 1900-01-01 00:00:00Z and loaded (saved) as the INT96 type. In Spark 3.0, the actions don&#8217;t fail but might lead to shifting of the input timestamps due to rebasing between the Julian and Proleptic Gregorian calendars. To restore the behavior before Spark 3.1, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.parquet.int96RebaseModeInRead</code> or/and <code class="language-plaintext highlighter-rouge">spark.sql.legacy.parquet.int96RebaseModeInWrite</code> to <code class="language-plaintext highlighter-rouge">LEGACY</code>.</p>
</li>
<li>
<p>In Spark 3.1, the <code class="language-plaintext highlighter-rouge">schema_of_json</code> and <code class="language-plaintext highlighter-rouge">schema_of_csv</code> functions return the schema in the SQL format in which field names are quoted. In Spark 3.0, the function returns a catalog string without field quoting and in lower case.</p>
</li>
<li>
<p>In Spark 3.1, refreshing a table will trigger an uncache operation for all other caches that reference the table, even if the table itself is not cached. In Spark 3.0 the operation will only be triggered if the table itself is cached.</p>
</li>
<li>
<p>In Spark 3.1, creating or altering a permanent view will capture runtime SQL configs and store them as view properties. These configs will be applied during the parsing and analysis phases of the view resolution. To restore the behavior before Spark 3.1, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.useCurrentConfigsForView</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.1, a temporary view has the same behavior as a permanent view, i.e. it captures and stores runtime SQL configs, SQL text, catalog and namespace. The captured view properties will be applied during the parsing and analysis phases of the view resolution. To restore the behavior before Spark 3.1, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.storeAnalyzedPlanForView</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.1, a temporary view created via <code class="language-plaintext highlighter-rouge">CACHE TABLE ... AS SELECT</code> also has the same behavior as a permanent view. In particular, when the temporary view is dropped, Spark will invalidate all its cache dependents, as well as the cache for the temporary view itself. This is different from Spark 3.0 and below, which only does the latter. To restore the previous behavior, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.storeAnalyzedPlanForView</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>Since Spark 3.1, CHAR/CHARACTER and VARCHAR types are supported in the table schema. Table scan/insertion will respect the char/varchar semantic. If char/varchar is used in places other than table schema, an exception will be thrown (CAST is an exception that simply treats char/varchar as string like before). To restore the behavior before Spark 3.1, which treats them as STRING types and ignores a length parameter, e.g. <code class="language-plaintext highlighter-rouge">CHAR(4)</code>, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.charVarcharAsString</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.1, <code class="language-plaintext highlighter-rouge">AnalysisException</code> is replaced by its sub-classes that are thrown for tables from Hive external catalog in the following situations:</p>
<ul>
<li><code class="language-plaintext highlighter-rouge">ALTER TABLE .. ADD PARTITION</code> throws <code class="language-plaintext highlighter-rouge">PartitionsAlreadyExistException</code> if new partition exists already</li>
<li><code class="language-plaintext highlighter-rouge">ALTER TABLE .. DROP PARTITION</code> throws <code class="language-plaintext highlighter-rouge">NoSuchPartitionsException</code> for not existing partitions</li>
</ul>
</li>
</ul>
<h2 id="upgrading-from-spark-sql-301-to-302">Upgrading from Spark SQL 3.0.1 to 3.0.2</h2>
<ul>
<li>In Spark 3.0.2, <code class="language-plaintext highlighter-rouge">AnalysisException</code> is replaced by its sub-classes that are thrown for tables from Hive external catalog in the following situations:
<ul>
<li><code class="language-plaintext highlighter-rouge">ALTER TABLE .. ADD PARTITION</code> throws <code class="language-plaintext highlighter-rouge">PartitionsAlreadyExistException</code> if new partition exists already</li>
<li><code class="language-plaintext highlighter-rouge">ALTER TABLE .. DROP PARTITION</code> throws <code class="language-plaintext highlighter-rouge">NoSuchPartitionsException</code> for not existing partitions</li>
</ul>
</li>
<li>
<p>In Spark 3.0.2, <code class="language-plaintext highlighter-rouge">PARTITION(col=null)</code> is always parsed as a null literal in the partition spec. In Spark 3.0.1 or earlier, it is parsed as a string literal of its text representation, e.g., string &#8220;null&#8221;, if the partition column is string type. To restore the legacy behavior, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.parseNullPartitionSpecAsStringLiteral</code> as true.</p>
</li>
<li>In Spark 3.0.0, the output schema of <code class="language-plaintext highlighter-rouge">SHOW DATABASES</code> becomes <code class="language-plaintext highlighter-rouge">namespace: string</code>. In Spark version 2.4 and earlier, the schema was <code class="language-plaintext highlighter-rouge">databaseName: string</code>. Since Spark 3.0.2, you can restore the old schema by setting <code class="language-plaintext highlighter-rouge">spark.sql.legacy.keepCommandOutputSchema</code> to <code class="language-plaintext highlighter-rouge">true</code>.</li>
</ul>
<h2 id="upgrading-from-spark-sql-30-to-301">Upgrading from Spark SQL 3.0 to 3.0.1</h2>
<ul>
<li>
<p>In Spark 3.0, JSON datasource and JSON function <code class="language-plaintext highlighter-rouge">schema_of_json</code> infer TimestampType from string values if they match to the pattern defined by the JSON option <code class="language-plaintext highlighter-rouge">timestampFormat</code>. Since version 3.0.1, the timestamp type inference is disabled by default. Set the JSON option <code class="language-plaintext highlighter-rouge">inferTimestamp</code> to <code class="language-plaintext highlighter-rouge">true</code> to enable such type inference.</p>
</li>
<li>
<p>In Spark 3.0, when casting string to integral types (tinyint, smallint, int and bigint), datetime types (date, timestamp and interval) and boolean type, the leading and trailing characters (&lt;= ASCII 32) will be trimmed. For example, <code class="language-plaintext highlighter-rouge">cast('\b1\b' as int)</code> results in <code class="language-plaintext highlighter-rouge">1</code>. Since Spark 3.0.1, only the leading and trailing whitespace ASCII characters will be trimmed. For example, <code class="language-plaintext highlighter-rouge">cast('\t1\t' as int)</code> results in <code class="language-plaintext highlighter-rouge">1</code> but <code class="language-plaintext highlighter-rouge">cast('\b1\b' as int)</code> results in <code class="language-plaintext highlighter-rouge">NULL</code>.</p>
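<p>For illustration (assuming an active <code>SparkSession</code> named <code>spark</code>):</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Since Spark 3.0.1, only leading/trailing whitespace characters are trimmed before the cast.
// The Scala escapes \t and \b embed a real tab / backspace character into the SQL literal.
spark.sql("SELECT cast('\t1\t' AS int) AS a, cast('\b1\b' AS int) AS b").show()
// a = 1 (tab is whitespace), b = NULL (backspace is not)
</code></pre></div></div>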
</li>
</ul>
<h2 id="upgrading-from-spark-sql-24-to-30">Upgrading from Spark SQL 2.4 to 3.0</h2>
<h3 id="datasetdataframe-apis">Dataset/DataFrame APIs</h3>
<ul>
<li>
<p>In Spark 3.0, the Dataset and DataFrame API <code class="language-plaintext highlighter-rouge">unionAll</code> is no longer deprecated. It is an alias for <code class="language-plaintext highlighter-rouge">union</code>.</p>
</li>
<li>
<p>In Spark 2.4 and below, <code class="language-plaintext highlighter-rouge">Dataset.groupByKey</code> results in a grouped dataset whose key attribute is wrongly named &#8220;value&#8221; if the key is a non-struct type, for example, int, string, array, etc. This is counterintuitive and makes the schema of aggregation queries unexpected. For example, the schema of <code class="language-plaintext highlighter-rouge">ds.groupByKey(...).count()</code> is <code class="language-plaintext highlighter-rouge">(value, count)</code>. Since Spark 3.0, we name the grouping attribute &#8220;key&#8221;. The old behavior is preserved under a newly added configuration <code class="language-plaintext highlighter-rouge">spark.sql.legacy.dataset.nameNonStructGroupingKeyAsValue</code> with a default value of <code class="language-plaintext highlighter-rouge">false</code>.</p>
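<p>A minimal sketch of the renamed grouping column (assuming an active <code>SparkSession</code> named <code>spark</code>, e.g. in <code>spark-shell</code>):</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import spark.implicits._   // encoders used by groupByKey

// Spark 3.0: the grouping column of a non-struct key is named "key" (it was "value" in 2.4).
val counted = spark.range(5).groupByKey(_ % 2).count()
counted.printSchema()   // columns: key, count(1)
</code></pre></div></div>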
</li>
<li>
<p>In Spark 3.0, the column metadata will always be propagated in the APIs <code class="language-plaintext highlighter-rouge">Column.name</code> and <code class="language-plaintext highlighter-rouge">Column.as</code>. In Spark version 2.4 and earlier, the metadata of <code class="language-plaintext highlighter-rouge">NamedExpression</code> is set as the <code class="language-plaintext highlighter-rouge">explicitMetadata</code> for the new column at the time the API is called; it won&#8217;t change even if the underlying <code class="language-plaintext highlighter-rouge">NamedExpression</code> changes metadata. To restore the behavior before Spark 3.0, you can use the API <code class="language-plaintext highlighter-rouge">as(alias: String, metadata: Metadata)</code> with explicit metadata.</p>
</li>
</ul>
<h3 id="ddl-statements">DDL Statements</h3>
<ul>
<li>
<p>In Spark 3.0, when inserting a value into a table column with a different data type, the type coercion is performed as per the ANSI SQL standard. Certain unreasonable type conversions such as converting <code class="language-plaintext highlighter-rouge">string</code> to <code class="language-plaintext highlighter-rouge">int</code> and <code class="language-plaintext highlighter-rouge">double</code> to <code class="language-plaintext highlighter-rouge">boolean</code> are disallowed. A runtime exception is thrown if the value is out-of-range for the data type of the column. In Spark version 2.4 and below, type conversions during table insertion are allowed as long as they are a valid <code class="language-plaintext highlighter-rouge">Cast</code>. When inserting an out-of-range value into an integral field, the low-order bits of the value are inserted (the same as Java/Scala numeric type casting). For example, if 257 is inserted into a field of byte type, the result is 1. The behavior is controlled by the option <code class="language-plaintext highlighter-rouge">spark.sql.storeAssignmentPolicy</code>, with a default value of &#8220;ANSI&#8221;. Setting the option to &#8220;Legacy&#8221; restores the previous behavior.</p>
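<p>A sketch with a hypothetical table named <code>bytes_demo</code> (assuming an active <code>SparkSession</code> named <code>spark</code>):</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>spark.sql("CREATE TABLE bytes_demo (b BYTE) USING parquet")
// Under the default ANSI policy, an out-of-range insert fails at runtime:
// spark.sql("INSERT INTO bytes_demo VALUES (257)")   // 257 is out of range for BYTE
// The legacy policy reproduces the old low-order-bits behavior (257 becomes 1):
spark.conf.set("spark.sql.storeAssignmentPolicy", "LEGACY")
spark.sql("INSERT INTO bytes_demo VALUES (257)")
spark.sql("SELECT b FROM bytes_demo").show()   // 1
</code></pre></div></div>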
</li>
<li>
<p>The <code class="language-plaintext highlighter-rouge">ADD JAR</code> command previously returned a result set with the single value 0. It now returns an empty result set.</p>
</li>
<li>
<p>Spark 2.4 and below: the <code class="language-plaintext highlighter-rouge">SET</code> command works without any warnings even if the specified key is for <code class="language-plaintext highlighter-rouge">SparkConf</code> entries and it has no effect because the command does not update <code class="language-plaintext highlighter-rouge">SparkConf</code>, but the behavior might confuse users. In 3.0, the command fails if a <code class="language-plaintext highlighter-rouge">SparkConf</code> key is used. You can disable such a check by setting <code class="language-plaintext highlighter-rouge">spark.sql.legacy.setCommandRejectsSparkCoreConfs</code> to <code class="language-plaintext highlighter-rouge">false</code>.</p>
</li>
<li>
<p>Refreshing a cached table would trigger a table uncache operation and then a table cache (lazily) operation. In Spark version 2.4 and below, the cache name and storage level are not preserved before the uncache operation. Therefore, the cache name and storage level could be changed unexpectedly. In Spark 3.0, cache name and storage level are first preserved for cache recreation. It helps to maintain a consistent cache behavior upon table refreshing.</p>
</li>
<li>
<p>In Spark 3.0, the properties listing below become reserved; commands fail if you specify reserved properties in places like <code class="language-plaintext highlighter-rouge">CREATE DATABASE ... WITH DBPROPERTIES</code> and <code class="language-plaintext highlighter-rouge">ALTER TABLE ... SET TBLPROPERTIES</code>. You need to use their specific clauses to specify them, for example, <code class="language-plaintext highlighter-rouge">CREATE DATABASE test COMMENT 'any comment' LOCATION 'some path'</code>. You can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.notReserveProperties</code> to <code class="language-plaintext highlighter-rouge">true</code> to ignore the <code class="language-plaintext highlighter-rouge">ParseException</code>; in this case, these properties will be silently removed, for example: <code class="language-plaintext highlighter-rouge">SET DBPROPERTIES('location'='/tmp')</code> will have no effect. In Spark version 2.4 and below, these properties are neither reserved nor have side effects, for example, <code class="language-plaintext highlighter-rouge">SET DBPROPERTIES('location'='/tmp')</code> does not change the location of the database but only creates a headless property, just like <code class="language-plaintext highlighter-rouge">'a'='b'</code>.</p>
<table>
<thead>
<tr>
<th>Property (case sensitive)</th>
<th>Database Reserved</th>
<th>Table Reserved</th>
<th>Remarks</th>
</tr>
</thead>
<tbody>
<tr>
<td>provider</td>
<td>no</td>
<td>yes</td>
<td>For tables, use the <code class="language-plaintext highlighter-rouge">USING</code> clause to specify it. Once set, it can&#8217;t be changed.</td>
</tr>
<tr>
<td>location</td>
<td>yes</td>
<td>yes</td>
<td>For databases and tables, use the <code class="language-plaintext highlighter-rouge">LOCATION</code> clause to specify it.</td>
</tr>
<tr>
<td>owner</td>
<td>yes</td>
<td>yes</td>
<td>For databases and tables, it is determined by the user who runs Spark and creates the table.</td>
</tr>
</tbody>
</table>
</li>
<li>
<p>In Spark 3.0, you can use <code class="language-plaintext highlighter-rouge">ADD FILE</code> to add file directories as well. Earlier you could add only single files using this command. To restore the behavior of earlier versions, set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.addSingleFileInAddFile</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.0, <code class="language-plaintext highlighter-rouge">SHOW TBLPROPERTIES</code> throws <code class="language-plaintext highlighter-rouge">AnalysisException</code> if the table does not exist. In Spark version 2.4 and below, this scenario caused <code class="language-plaintext highlighter-rouge">NoSuchTableException</code>.</p>
</li>
<li>
<p>In Spark 3.0, <code class="language-plaintext highlighter-rouge">SHOW CREATE TABLE table_identifier</code> always returns Spark DDL, even when the given table is a Hive SerDe table. For generating Hive DDL, use <code class="language-plaintext highlighter-rouge">SHOW CREATE TABLE table_identifier AS SERDE</code> command instead.</p>
</li>
<li>
<p>In Spark 3.0, column of CHAR type is not allowed in non-Hive-Serde tables, and CREATE/ALTER TABLE commands will fail if CHAR type is detected. Please use STRING type instead. In Spark version 2.4 and below, CHAR type is treated as STRING type and the length parameter is simply ignored.</p>
</li>
</ul>
<h3 id="udfs-and-built-in-functions">UDFs and Built-in Functions</h3>
<ul>
<li>
<p>In Spark 3.0, the <code class="language-plaintext highlighter-rouge">date_add</code> and <code class="language-plaintext highlighter-rouge">date_sub</code> functions accepts only int, smallint, tinyint as the 2nd argument; fractional and non-literal strings are not valid anymore, for example: <code class="language-plaintext highlighter-rouge">date_add(cast('1964-05-23' as date), '12.34')</code> causes <code class="language-plaintext highlighter-rouge">AnalysisException</code>. Note that, string literals are still allowed, but Spark will throw <code class="language-plaintext highlighter-rouge">AnalysisException</code> if the string content is not a valid integer. In Spark version 2.4 and below, if the 2nd argument is fractional or string value, it is coerced to int value, and the result is a date value of <code class="language-plaintext highlighter-rouge">1964-06-04</code>.</p>
</li>
<li>
<p>In Spark 3.0, the function <code class="language-plaintext highlighter-rouge">percentile_approx</code> and its alias <code class="language-plaintext highlighter-rouge">approx_percentile</code> only accept an integral value in the range <code class="language-plaintext highlighter-rouge">[1, 2147483647]</code> as their 3rd argument <code class="language-plaintext highlighter-rouge">accuracy</code>; fractional and string types are disallowed, for example, <code class="language-plaintext highlighter-rouge">percentile_approx(10.0, 0.2, 1.8D)</code> causes <code class="language-plaintext highlighter-rouge">AnalysisException</code>. In Spark version 2.4 and below, if <code class="language-plaintext highlighter-rouge">accuracy</code> is a fractional or string value, it is coerced to an int value, and <code class="language-plaintext highlighter-rouge">percentile_approx(10.0, 0.2, 1.8D)</code> is operated as <code class="language-plaintext highlighter-rouge">percentile_approx(10.0, 0.2, 1)</code> which results in <code class="language-plaintext highlighter-rouge">10.0</code>.</p>
</li>
<li>
<p>In Spark 3.0, an analysis exception is thrown when hash expressions are applied on elements of <code class="language-plaintext highlighter-rouge">MapType</code>. To restore the behavior before Spark 3.0, set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.allowHashOnMapType</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
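<p>A minimal sketch of the legacy switch, assuming a <code class="language-plaintext highlighter-rouge">spark-shell</code> session:</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Hashing a map raises AnalysisException by default in Spark 3.0:
//   spark.sql("SELECT hash(map(1, 'a'))")

// Opt back into the pre-3.0 behavior:
spark.conf.set("spark.sql.legacy.allowHashOnMapType", "true")
spark.sql("SELECT hash(map(1, 'a'))").show()
</code></pre></div></div>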
</li>
<li>
<p>In Spark 3.0, when the <code class="language-plaintext highlighter-rouge">array</code>/<code class="language-plaintext highlighter-rouge">map</code> function is called without any parameters, it returns an empty collection with <code class="language-plaintext highlighter-rouge">NullType</code> as element type. In Spark version 2.4 and below, it returns an empty collection with <code class="language-plaintext highlighter-rouge">StringType</code> as element type. To restore the behavior before Spark 3.0, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.createEmptyCollectionUsingStringType</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
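<p>A minimal sketch showing the element type of an empty collection, assuming a <code class="language-plaintext highlighter-rouge">spark-shell</code> session:</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// The element type of an empty collection is NullType in Spark 3.0:
spark.sql("SELECT array()").printSchema()

// Restore the Spark 2.4 behavior (StringType elements):
spark.conf.set("spark.sql.legacy.createEmptyCollectionUsingStringType", "true")
spark.sql("SELECT array()").printSchema()
</code></pre></div></div>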
</li>
<li>
<p>In Spark 3.0, the <code class="language-plaintext highlighter-rouge">from_json</code> function supports two modes - <code class="language-plaintext highlighter-rouge">PERMISSIVE</code> and <code class="language-plaintext highlighter-rouge">FAILFAST</code>. The modes can be set via the <code class="language-plaintext highlighter-rouge">mode</code> option. The default mode became <code class="language-plaintext highlighter-rouge">PERMISSIVE</code>. In previous versions, the behavior of <code class="language-plaintext highlighter-rouge">from_json</code> conformed to neither <code class="language-plaintext highlighter-rouge">PERMISSIVE</code> nor <code class="language-plaintext highlighter-rouge">FAILFAST</code>, especially in the processing of malformed JSON records. For example, the JSON string <code class="language-plaintext highlighter-rouge">{"a" 1}</code> with the schema <code class="language-plaintext highlighter-rouge">a INT</code> is converted to <code class="language-plaintext highlighter-rouge">null</code> by previous versions, but Spark 3.0 converts it to <code class="language-plaintext highlighter-rouge">Row(null)</code>.</p>
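<p>A small sketch reproducing the malformed-record example from the note above, assuming a <code class="language-plaintext highlighter-rouge">spark-shell</code> session:</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Malformed record {"a" 1} parsed with schema "a INT":
spark.sql("""SELECT from_json('{"a" 1}', 'a INT')""").show()
// Spark 3.0 (PERMISSIVE mode): a row whose field is null, i.e. Row(null)
// Spark 2.4 and below:         null for the whole value
</code></pre></div></div>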
</li>
<li>
<p>In Spark version 2.4 and below, you can create map values with map type keys via built-in functions such as <code class="language-plaintext highlighter-rouge">CreateMap</code>, <code class="language-plaintext highlighter-rouge">MapFromArrays</code>, etc. In Spark 3.0, creating map values with map type keys using these built-in functions is not allowed. Users can use the <code class="language-plaintext highlighter-rouge">map_entries</code> function to convert a map to array&lt;struct&lt;key, value&gt;&gt; as a workaround. In addition, users can still read map values with map type keys from data sources or Java/Scala collections, though this is discouraged.</p>
</li>
<li>
<p>In Spark version 2.4 and below, you can create a map with duplicated keys via built-in functions like <code class="language-plaintext highlighter-rouge">CreateMap</code>, <code class="language-plaintext highlighter-rouge">StringToMap</code>, etc. The behavior of a map with duplicated keys is undefined; for example, map lookup respects the duplicated key that appears first, <code class="language-plaintext highlighter-rouge">Dataset.collect</code> only keeps the duplicated key that appears last, <code class="language-plaintext highlighter-rouge">MapKeys</code> returns duplicated keys, etc. In Spark 3.0, Spark throws <code class="language-plaintext highlighter-rouge">RuntimeException</code> when duplicated keys are found. You can set <code class="language-plaintext highlighter-rouge">spark.sql.mapKeyDedupPolicy</code> to <code class="language-plaintext highlighter-rouge">LAST_WIN</code> to deduplicate map keys with the last-wins policy. Users may still read map values with duplicated keys from data sources which do not enforce it (for example, Parquet); the behavior is undefined.</p>
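<p>A minimal sketch of the dedup policy, assuming a <code class="language-plaintext highlighter-rouge">spark-shell</code> session:</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Creating a map with a duplicated key throws a RuntimeException by default in Spark 3.0:
//   spark.sql("SELECT map(1, 'a', 1, 'b')")

// Keep the last value for a duplicated key instead:
spark.conf.set("spark.sql.mapKeyDedupPolicy", "LAST_WIN")
spark.sql("SELECT map(1, 'a', 1, 'b')[1]").show()   // b
</code></pre></div></div>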
</li>
<li>
<p>In Spark 3.0, using <code class="language-plaintext highlighter-rouge">org.apache.spark.sql.functions.udf(AnyRef, DataType)</code> is not allowed by default. Removing the return type parameter to automatically switch to a typed Scala udf is recommended, or set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.allowUntypedScalaUDF</code> to true to keep using it. In Spark version 2.4 and below, if <code class="language-plaintext highlighter-rouge">org.apache.spark.sql.functions.udf(AnyRef, DataType)</code> gets a Scala closure with a primitive-type argument, the returned UDF returns null if the input value is null. However, in Spark 3.0, the UDF returns the default value of the Java type if the input value is null. For example, with <code class="language-plaintext highlighter-rouge">val f = udf((x: Int) =&gt; x, IntegerType)</code>, <code class="language-plaintext highlighter-rouge">f($"x")</code> returns null in Spark 2.4 and below if column <code class="language-plaintext highlighter-rouge">x</code> is null, and returns 0 in Spark 3.0. This behavior change is introduced because Spark 3.0 is built with Scala 2.12 by default.</p>
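<p>A minimal sketch contrasting the typed and untyped forms, assuming a <code class="language-plaintext highlighter-rouge">spark-shell</code> session:</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.IntegerType

// Preferred in Spark 3.0: a typed Scala UDF (no explicit return DataType).
val typedF = udf((x: Int) =&gt; x)

// The untyped variant is disallowed by default in Spark 3.0; it only works with
// spark.sql.legacy.allowUntypedScalaUDF=true and returns 0 (not null) for a null input.
//   val untypedF = udf((x: Int) =&gt; x, IntegerType)
</code></pre></div></div>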
</li>
<li>
<p>In Spark 3.0, the higher-order function <code class="language-plaintext highlighter-rouge">exists</code> follows three-valued boolean logic, that is, if the <code class="language-plaintext highlighter-rouge">predicate</code> returns any <code class="language-plaintext highlighter-rouge">null</code>s and no <code class="language-plaintext highlighter-rouge">true</code> is obtained, then <code class="language-plaintext highlighter-rouge">exists</code> returns <code class="language-plaintext highlighter-rouge">null</code> instead of <code class="language-plaintext highlighter-rouge">false</code>. For example, <code class="language-plaintext highlighter-rouge">exists(array(1, null, 3), x -&gt; x % 2 == 0)</code> is <code class="language-plaintext highlighter-rouge">null</code>. The previous behavior can be restored by setting <code class="language-plaintext highlighter-rouge">spark.sql.legacy.followThreeValuedLogicInArrayExists</code> to <code class="language-plaintext highlighter-rouge">false</code>.</p>
</li>
<li>
<p>In Spark 3.0, the <code class="language-plaintext highlighter-rouge">add_months</code> function does not adjust the resulting date to the last day of the month if the original date is the last day of a month. For example, <code class="language-plaintext highlighter-rouge">select add_months(DATE'2019-02-28', 1)</code> results in <code class="language-plaintext highlighter-rouge">2019-03-28</code>. In Spark version 2.4 and below, the resulting date is adjusted when the original date is the last day of a month. For example, adding a month to <code class="language-plaintext highlighter-rouge">2019-02-28</code> results in <code class="language-plaintext highlighter-rouge">2019-03-31</code>.</p>
</li>
<li>
<p>In Spark version 2.4 and below, the <code class="language-plaintext highlighter-rouge">current_timestamp</code> function returns a timestamp with millisecond resolution only. In Spark 3.0, the function can return the result with microsecond resolution if the underlying clock available on the system offers such resolution.</p>
</li>
<li>
<p>In Spark 3.0, a 0-argument Java UDF is executed on the executor side, the same as other UDFs. In Spark version 2.4 and below, a 0-argument Java UDF alone was executed on the driver side, and the result was propagated to executors, which might be more performant in some cases but caused inconsistency and correctness issues in others.</p>
</li>
<li>
<p>The result of <code class="language-plaintext highlighter-rouge">java.lang.Math</code>&#8217;s <code class="language-plaintext highlighter-rouge">log</code>, <code class="language-plaintext highlighter-rouge">log1p</code>, <code class="language-plaintext highlighter-rouge">exp</code>, <code class="language-plaintext highlighter-rouge">expm1</code>, and <code class="language-plaintext highlighter-rouge">pow</code> may vary across platforms. In Spark 3.0, the equivalent SQL functions (including related SQL functions like <code class="language-plaintext highlighter-rouge">LOG10</code>) return values consistent with <code class="language-plaintext highlighter-rouge">java.lang.StrictMath</code>. In virtually all cases this makes no difference in the return value, and where it does the difference is very small, but the result may not exactly match <code class="language-plaintext highlighter-rouge">java.lang.Math</code> on x86 platforms in cases such as <code class="language-plaintext highlighter-rouge">log(3.0)</code>, whose value varies between <code class="language-plaintext highlighter-rouge">Math.log()</code> and <code class="language-plaintext highlighter-rouge">StrictMath.log()</code>.</p>
</li>
<li>
<p>In Spark 3.0, the <code class="language-plaintext highlighter-rouge">cast</code> function processes string literals such as &#8216;Infinity&#8217;, &#8216;+Infinity&#8217;, &#8216;-Infinity&#8217;, &#8216;NaN&#8217;, &#8216;Inf&#8217;, &#8216;+Inf&#8217;, &#8216;-Inf&#8217; in a case-insensitive manner when casting the literals to <code class="language-plaintext highlighter-rouge">Double</code> or <code class="language-plaintext highlighter-rouge">Float</code> type to ensure greater compatibility with other database systems. This behavior change is illustrated in the table below:</p>
<table>
<thead>
<tr>
<th>Operation</th>
<th>Result before Spark 3.0</th>
<th>Result in Spark 3.0</th>
</tr>
</thead>
<tbody>
<tr>
<td>CAST(&#8216;infinity&#8217; AS DOUBLE)</td>
<td>NULL</td>
<td>Double.PositiveInfinity</td>
</tr>
<tr>
<td>CAST(&#8216;+infinity&#8217; AS DOUBLE)</td>
<td>NULL</td>
<td>Double.PositiveInfinity</td>
</tr>
<tr>
<td>CAST(&#8216;inf&#8217; AS DOUBLE)</td>
<td>NULL</td>
<td>Double.PositiveInfinity</td>
</tr>
<tr>
<td>CAST(&#8216;+inf&#8217; AS DOUBLE)</td>
<td>NULL</td>
<td>Double.PositiveInfinity</td>
</tr>
<tr>
<td>CAST(&#8216;-infinity&#8217; AS DOUBLE)</td>
<td>NULL</td>
<td>Double.NegativeInfinity</td>
</tr>
<tr>
<td>CAST(&#8216;-inf&#8217; AS DOUBLE)</td>
<td>NULL</td>
<td>Double.NegativeInfinity</td>
</tr>
<tr>
<td>CAST(&#8216;infinity&#8217; AS FLOAT)</td>
<td>NULL</td>
<td>Float.PositiveInfinity</td>
</tr>
<tr>
<td>CAST(&#8216;+infinity&#8217; AS FLOAT)</td>
<td>NULL</td>
<td>Float.PositiveInfinity</td>
</tr>
<tr>
<td>CAST(&#8216;inf&#8217; AS FLOAT)</td>
<td>NULL</td>
<td>Float.PositiveInfinity</td>
</tr>
<tr>
<td>CAST(&#8216;+inf&#8217; AS FLOAT)</td>
<td>NULL</td>
<td>Float.PositiveInfinity</td>
</tr>
<tr>
<td>CAST(&#8216;-infinity&#8217; AS FLOAT)</td>
<td>NULL</td>
<td>Float.NegativeInfinity</td>
</tr>
<tr>
<td>CAST(&#8216;-inf&#8217; AS FLOAT)</td>
<td>NULL</td>
<td>Float.NegativeInfinity</td>
</tr>
<tr>
<td>CAST(&#8216;nan&#8217; AS DOUBLE)</td>
<td>NULL</td>
<td>Double.NaN</td>
</tr>
<tr>
<td>CAST(&#8216;nan&#8217; AS FLOAT)</td>
<td>NULL</td>
<td>Float.NaN</td>
</tr>
</tbody>
</table>
</li>
<li>
<p>In Spark 3.0, when casting interval values to string type, there is no &#8220;interval&#8221; prefix, for example, <code class="language-plaintext highlighter-rouge">1 days 2 hours</code>. In Spark version 2.4 and below, the string contains the &#8220;interval&#8221; prefix like <code class="language-plaintext highlighter-rouge">interval 1 days 2 hours</code>.</p>
</li>
<li>
<p>In Spark 3.0, when casting string values to integral types (tinyint, smallint, int and bigint), datetime types (date, timestamp and interval) and boolean type, the leading and trailing whitespace (&lt;= ASCII 32) is trimmed before the conversion, for example, <code class="language-plaintext highlighter-rouge">cast(' 1\t' as int)</code> results in <code class="language-plaintext highlighter-rouge">1</code>, <code class="language-plaintext highlighter-rouge">cast(' 1\t' as boolean)</code> results in <code class="language-plaintext highlighter-rouge">true</code>, and <code class="language-plaintext highlighter-rouge">cast('2019-10-10\t' as date)</code> results in the date value <code class="language-plaintext highlighter-rouge">2019-10-10</code>. In Spark version 2.4 and below, when casting strings to integrals and booleans, whitespace is not trimmed from both ends and the foregoing results are <code class="language-plaintext highlighter-rouge">null</code>, while for datetimes, only trailing spaces (= ASCII 32) are removed.</p>
</li>
</ul>
<h3 id="query-engine">Query Engine</h3>
<ul>
<li>
<p>In Spark version 2.4 and below, SQL queries such as <code class="language-plaintext highlighter-rouge">FROM &lt;table&gt;</code> or <code class="language-plaintext highlighter-rouge">FROM &lt;table&gt; UNION ALL FROM &lt;table&gt;</code> are supported by accident. In hive-style <code class="language-plaintext highlighter-rouge">FROM &lt;table&gt; SELECT &lt;expr&gt;</code>, the <code class="language-plaintext highlighter-rouge">SELECT</code> clause is not optional. Neither Hive nor Presto supports this syntax. These queries are treated as invalid in Spark 3.0.</p>
</li>
<li>
<p>In Spark 3.0, the interval literal syntax does not allow multiple from-to units anymore. For example, <code class="language-plaintext highlighter-rouge">SELECT INTERVAL '1-1' YEAR TO MONTH '2-2' YEAR TO MONTH</code> throws a parser exception.</p>
</li>
<li>
<p>In Spark 3.0, numbers written in scientific notation (for example, <code class="language-plaintext highlighter-rouge">1E2</code>) are parsed as Double. In Spark version 2.4 and below, they are parsed as Decimal. To restore the behavior before Spark 3.0, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.exponentLiteralAsDecimal.enabled</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.0, day-time interval strings are converted to intervals with respect to the <code class="language-plaintext highlighter-rouge">from</code> and <code class="language-plaintext highlighter-rouge">to</code> bounds. If an input string does not match the pattern defined by the specified bounds, a <code class="language-plaintext highlighter-rouge">ParseException</code> is thrown. For example, <code class="language-plaintext highlighter-rouge">interval '2 10:20' hour to minute</code> raises the exception because the expected format is <code class="language-plaintext highlighter-rouge">[+|-]h[h]:[m]m</code>. In Spark version 2.4, the <code class="language-plaintext highlighter-rouge">from</code> bound was not taken into account, and the <code class="language-plaintext highlighter-rouge">to</code> bound was used to truncate the resulting interval. For instance, the day-time interval string from the example above is converted to <code class="language-plaintext highlighter-rouge">interval 10 hours 20 minutes</code>. To restore the behavior before Spark 3.0, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.fromDayTimeString.enabled</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.0, negative scale of decimal is not allowed by default; for example, the data type of a literal like <code class="language-plaintext highlighter-rouge">1E10BD</code> is <code class="language-plaintext highlighter-rouge">DecimalType(11, 0)</code>. In Spark version 2.4 and below, it was <code class="language-plaintext highlighter-rouge">DecimalType(2, -9)</code>. To restore the behavior before Spark 3.0, you can set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.allowNegativeScaleOfDecimal</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
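<p>A tiny sketch to inspect the literal type, assuming a <code class="language-plaintext highlighter-rouge">spark-shell</code> session (the <code class="language-plaintext highlighter-rouge">typeof</code> function is available in Spark 3.0 and above):</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// The literal 1E10BD is now typed as decimal(11,0) rather than decimal(2,-9):
spark.sql("SELECT typeof(1E10BD)").show()
</code></pre></div></div>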
</li>
<li>
<p>In Spark 3.0, the unary arithmetic operator plus (<code class="language-plaintext highlighter-rouge">+</code>) only accepts string, numeric and interval type values as inputs. In addition, <code class="language-plaintext highlighter-rouge">+</code> applied to an integral string representation is coerced to a double value, for example, <code class="language-plaintext highlighter-rouge">+'1'</code> returns <code class="language-plaintext highlighter-rouge">1.0</code>. In Spark version 2.4 and below, this operator is ignored. There is no type checking for it, so any value with a <code class="language-plaintext highlighter-rouge">+</code> prefix is valid, for example, <code class="language-plaintext highlighter-rouge">+ array(1, 2)</code> is valid and results in <code class="language-plaintext highlighter-rouge">[1, 2]</code>. There is also no type coercion for it at all; for example, in Spark 2.4, the result of <code class="language-plaintext highlighter-rouge">+'1'</code> is the string <code class="language-plaintext highlighter-rouge">1</code>.</p>
</li>
<li>
<p>In Spark 3.0, a Dataset query fails if it contains an ambiguous column reference caused by a self join. A typical example: <code class="language-plaintext highlighter-rouge">val df1 = ...; val df2 = df1.filter(...);</code>, then <code class="language-plaintext highlighter-rouge">df1.join(df2, df1("a") &gt; df2("a"))</code> returns an empty result, which is quite confusing. This is because Spark cannot resolve Dataset column references that point to tables being self joined, and <code class="language-plaintext highlighter-rouge">df1("a")</code> is exactly the same as <code class="language-plaintext highlighter-rouge">df2("a")</code> in Spark. To restore the behavior before Spark 3.0, you can set <code class="language-plaintext highlighter-rouge">spark.sql.analyzer.failAmbiguousSelfJoin</code> to <code class="language-plaintext highlighter-rouge">false</code>.</p>
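<p>A minimal sketch of the failing pattern and one common workaround, assuming a <code class="language-plaintext highlighter-rouge">spark-shell</code> session; the column name and data are illustrative only:</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import spark.implicits._

val df1 = spark.range(5).withColumnRenamed("id", "a")
val df2 = df1.filter($"a" &gt; 1)

// Fails analysis in Spark 3.0 because df1("a") and df2("a") are the same reference:
//   df1.join(df2, df1("a") &gt; df2("a"))

// Disambiguate with aliases instead:
df1.alias("l").join(df2.alias("r"), $"l.a" &gt; $"r.a").show()
</code></pre></div></div>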
</li>
<li>
<p>In Spark 3.0, <code class="language-plaintext highlighter-rouge">spark.sql.legacy.ctePrecedencePolicy</code> is introduced to control the behavior for name conflicts in nested WITH clauses. With the default value <code class="language-plaintext highlighter-rouge">EXCEPTION</code>, Spark throws an AnalysisException and forces users to choose the specific substitution order they want. If set to <code class="language-plaintext highlighter-rouge">CORRECTED</code> (which is recommended), inner CTE definitions take precedence over outer definitions. For example, with the config set to <code class="language-plaintext highlighter-rouge">CORRECTED</code>, <code class="language-plaintext highlighter-rouge">WITH t AS (SELECT 1), t2 AS (WITH t AS (SELECT 2) SELECT * FROM t) SELECT * FROM t2</code> returns <code class="language-plaintext highlighter-rouge">2</code>, while setting it to <code class="language-plaintext highlighter-rouge">LEGACY</code> gives <code class="language-plaintext highlighter-rouge">1</code>, which is the behavior in version 2.4 and below.</p>
</li>
<li>
<p>In Spark 3.0, the configuration <code class="language-plaintext highlighter-rouge">spark.sql.crossJoin.enabled</code> becomes an internal configuration and is true by default, so by default Spark does not raise an exception on SQL with implicit cross joins.</p>
</li>
<li>
<p>In Spark version 2.4 and below, float/double -0.0 is semantically equal to 0.0, but -0.0 and 0.0 are considered as different values when used in aggregate grouping keys, window partition keys, and join keys. In Spark 3.0, this bug is fixed. For example, <code class="language-plaintext highlighter-rouge">Seq(-0.0, 0.0).toDF("d").groupBy("d").count()</code> returns <code class="language-plaintext highlighter-rouge">[(0.0, 2)]</code> in Spark 3.0, and <code class="language-plaintext highlighter-rouge">[(0.0, 1), (-0.0, 1)]</code> in Spark 2.4 and below.</p>
</li>
<li>
<p>In Spark version 2.4 and below, invalid time zone ids are silently ignored and replaced by GMT time zone, for example, in the from_utc_timestamp function. In Spark 3.0, such time zone ids are rejected, and Spark throws <code class="language-plaintext highlighter-rouge">java.time.DateTimeException</code>.</p>
</li>
<li>
<p>In Spark 3.0, the Proleptic Gregorian calendar is used in parsing, formatting, and converting dates and timestamps as well as in extracting sub-components like years, days and so on. Spark 3.0 uses Java 8 API classes from the <code class="language-plaintext highlighter-rouge">java.time</code> packages that are based on <a href="https://docs.oracle.com/javase/8/docs/api/java/time/chrono/IsoChronology.html">ISO chronology</a>. In Spark version 2.4 and below, those operations are performed using the hybrid calendar (<a href="https://docs.oracle.com/javase/7/docs/api/java/util/GregorianCalendar.html">Julian + Gregorian</a>). The changes affect the results for dates before October 15, 1582 (Gregorian) and affect the following Spark 3.0 APIs:</p>
<ul>
<li>
<p>Parsing/formatting of timestamp/date strings. This affects CSV/JSON datasources and the <code class="language-plaintext highlighter-rouge">unix_timestamp</code>, <code class="language-plaintext highlighter-rouge">date_format</code>, <code class="language-plaintext highlighter-rouge">to_unix_timestamp</code>, <code class="language-plaintext highlighter-rouge">from_unixtime</code>, <code class="language-plaintext highlighter-rouge">to_date</code>, <code class="language-plaintext highlighter-rouge">to_timestamp</code> functions when user-specified patterns are used for parsing and formatting. In Spark 3.0, we define our own pattern strings in <a href="sql-ref-datetime-pattern.html">Datetime Patterns for Formatting and Parsing</a>,
which is implemented via <a href="https://docs.oracle.com/javase/8/docs/api/java/time/format/DateTimeFormatter.html">DateTimeFormatter</a> under the hood. The new implementation performs strict checking of its input. For example, the <code class="language-plaintext highlighter-rouge">2015-07-22 10:00:00</code> timestamp cannot be parsed if the pattern is <code class="language-plaintext highlighter-rouge">yyyy-MM-dd</code> because the parser does not consume the whole input. Another example: the <code class="language-plaintext highlighter-rouge">31/01/2015 00:00</code> input cannot be parsed by the <code class="language-plaintext highlighter-rouge">dd/MM/yyyy hh:mm</code> pattern because <code class="language-plaintext highlighter-rouge">hh</code> expects hours in the range <code class="language-plaintext highlighter-rouge">1-12</code>. In Spark version 2.4 and below, <code class="language-plaintext highlighter-rouge">java.text.SimpleDateFormat</code> is used for timestamp/date string conversions, and the supported patterns are described in <a href="https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html">SimpleDateFormat</a>. The old behavior can be restored by setting <code class="language-plaintext highlighter-rouge">spark.sql.legacy.timeParserPolicy</code> to <code class="language-plaintext highlighter-rouge">LEGACY</code>.</p>
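<p>A minimal sketch of the legacy switch, assuming a <code class="language-plaintext highlighter-rouge">spark-shell</code> session:</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Strict parsing: the pattern must consume the whole input string.
//   spark.sql("SELECT to_timestamp('2015-07-22 10:00:00', 'yyyy-MM-dd')")   // fails in Spark 3.0

// Restore the SimpleDateFormat-based parsing of Spark 2.4 and below:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")
spark.sql("SELECT to_timestamp('2015-07-22 10:00:00', 'yyyy-MM-dd')").show()
</code></pre></div></div>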
</li>
<li>
<p>The <code class="language-plaintext highlighter-rouge">weekofyear</code>, <code class="language-plaintext highlighter-rouge">weekday</code>, <code class="language-plaintext highlighter-rouge">dayofweek</code>, <code class="language-plaintext highlighter-rouge">date_trunc</code>, <code class="language-plaintext highlighter-rouge">from_utc_timestamp</code>, <code class="language-plaintext highlighter-rouge">to_utc_timestamp</code>, and <code class="language-plaintext highlighter-rouge">unix_timestamp</code> functions use the java.time API to calculate the week number of the year and the day number of the week, as well as for conversion from/to TimestampType values in the UTC time zone.</p>
</li>
<li>
<p>The JDBC options <code class="language-plaintext highlighter-rouge">lowerBound</code> and <code class="language-plaintext highlighter-rouge">upperBound</code> are converted to TimestampType/DateType values in the same way as casting strings to TimestampType/DateType values. The conversion is based on Proleptic Gregorian calendar, and time zone defined by the SQL config <code class="language-plaintext highlighter-rouge">spark.sql.session.timeZone</code>. In Spark version 2.4 and below, the conversion is based on the hybrid calendar (Julian + Gregorian) and on default system time zone.</p>
</li>
<li>
<p>Formatting <code class="language-plaintext highlighter-rouge">TIMESTAMP</code> and <code class="language-plaintext highlighter-rouge">DATE</code> literals.</p>
</li>
<li>
<p>Creating typed <code class="language-plaintext highlighter-rouge">TIMESTAMP</code> and <code class="language-plaintext highlighter-rouge">DATE</code> literals from strings. In Spark 3.0, string conversion to typed <code class="language-plaintext highlighter-rouge">TIMESTAMP</code>/<code class="language-plaintext highlighter-rouge">DATE</code> literals is performed via casting to <code class="language-plaintext highlighter-rouge">TIMESTAMP</code>/<code class="language-plaintext highlighter-rouge">DATE</code> values. For example, <code class="language-plaintext highlighter-rouge">TIMESTAMP '2019-12-23 12:59:30'</code> is semantically equal to <code class="language-plaintext highlighter-rouge">CAST('2019-12-23 12:59:30' AS TIMESTAMP)</code>. When the input string does not contain information about the time zone, the time zone from the SQL config <code class="language-plaintext highlighter-rouge">spark.sql.session.timeZone</code> is used. In Spark version 2.4 and below, the conversion is based on the JVM system time zone. The different sources of the default time zone may change the behavior of typed <code class="language-plaintext highlighter-rouge">TIMESTAMP</code> and <code class="language-plaintext highlighter-rouge">DATE</code> literals.</p>
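<p>A minimal sketch, assuming a <code class="language-plaintext highlighter-rouge">spark-shell</code> session; the time zone value shown is only an example:</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// The session time zone is used when the string carries no zone information:
spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
spark.sql("SELECT TIMESTAMP '2019-12-23 12:59:30'").show(false)

// Semantically the same as an explicit cast:
spark.sql("SELECT CAST('2019-12-23 12:59:30' AS TIMESTAMP)").show(false)
</code></pre></div></div>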
</li>
</ul>
</li>
<li>
<p>In Spark 3.0, <code class="language-plaintext highlighter-rouge">TIMESTAMP</code> literals are converted to strings using the SQL config <code class="language-plaintext highlighter-rouge">spark.sql.session.timeZone</code>. In Spark version 2.4 and below, the conversion uses the default time zone of the Java virtual machine.</p>
</li>
<li>
<p>In Spark 3.0, Spark casts <code class="language-plaintext highlighter-rouge">String</code> to <code class="language-plaintext highlighter-rouge">Date/Timestamp</code> in binary comparisons with dates/timestamps. The previous behavior of casting <code class="language-plaintext highlighter-rouge">Date/Timestamp</code> to <code class="language-plaintext highlighter-rouge">String</code> can be restored by setting <code class="language-plaintext highlighter-rouge">spark.sql.legacy.typeCoercion.datetimeToString.enabled</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>In Spark 3.0, special values are supported in conversion from strings to dates and timestamps. Those values are simply notational shorthands that are converted to ordinary date or timestamp values when read. The following string values are supported for dates:
<ul>
<li><code class="language-plaintext highlighter-rouge">epoch [zoneId]</code> - 1970-01-01</li>
<li><code class="language-plaintext highlighter-rouge">today [zoneId]</code> - the current date in the time zone specified by <code class="language-plaintext highlighter-rouge">spark.sql.session.timeZone</code></li>
<li><code class="language-plaintext highlighter-rouge">yesterday [zoneId]</code> - the current date - 1</li>
<li><code class="language-plaintext highlighter-rouge">tomorrow [zoneId]</code> - the current date + 1</li>
<li><code class="language-plaintext highlighter-rouge">now</code> - the date on which the current query runs. It has the same meaning as today</li>
</ul>
<p>For example <code class="language-plaintext highlighter-rouge">SELECT date 'tomorrow' - date 'yesterday';</code> should output <code class="language-plaintext highlighter-rouge">2</code>. Here are special timestamp values:</p>
<ul>
<li><code class="language-plaintext highlighter-rouge">epoch [zoneId]</code> - 1970-01-01 00:00:00+00 (Unix system time zero)</li>
<li><code class="language-plaintext highlighter-rouge">today [zoneId]</code> - midnight today</li>
<li><code class="language-plaintext highlighter-rouge">yesterday [zoneId]</code> - midnight yesterday</li>
<li><code class="language-plaintext highlighter-rouge">tomorrow [zoneId]</code> - midnight tomorrow</li>
<li><code class="language-plaintext highlighter-rouge">now</code> - current query start time</li>
</ul>
<p>For example <code class="language-plaintext highlighter-rouge">SELECT timestamp 'tomorrow';</code>.</p>
</li>
<li>
<p>Since Spark 3.0, when using the <code class="language-plaintext highlighter-rouge">EXTRACT</code> expression to extract the second field from date/timestamp values, the result is a <code class="language-plaintext highlighter-rouge">DecimalType(8, 6)</code> value with 2 digits for the second part and 6 digits for the fractional part with microsecond precision. For example, <code class="language-plaintext highlighter-rouge">extract(second from to_timestamp('2019-09-20 10:10:10.1'))</code> results in <code class="language-plaintext highlighter-rouge">10.100000</code>. In Spark version 2.4 and earlier, it returns an <code class="language-plaintext highlighter-rouge">IntegerType</code> value and the result for the former example is <code class="language-plaintext highlighter-rouge">10</code>.</p>
</li>
<li>In Spark 3.0, datetime pattern letter <code class="language-plaintext highlighter-rouge">F</code> is <strong>aligned day of week in month</strong>, which represents the count of days within the period of a week where the weeks are aligned to the start of the month. In Spark version 2.4 and earlier, it is <strong>week of month</strong>, which represents the count of weeks within the month where weeks start on a fixed day-of-week. For example, <code class="language-plaintext highlighter-rouge">2020-07-30</code> is the 30th day of the month (4 full weeks plus 2 days), so <code class="language-plaintext highlighter-rouge">date_format(date '2020-07-30', 'F')</code> returns 2 in Spark 3.0, but as a week count in Spark 2.x it returns 5, because the date falls in the 5th week of July 2020, where week one is 2020-07-01 to 2020-07-04.</li>
</ul>
<h3 id="data-sources">Data Sources</h3>
<ul>
<li>
<p>In Spark version 2.4 and below, when reading a Hive SerDe table with Spark native data sources (parquet/orc), Spark infers the actual file schema and updates the table schema in the metastore. In Spark 3.0, Spark does not infer the schema anymore. This should not cause any problems for end users, but if it does, set <code class="language-plaintext highlighter-rouge">spark.sql.hive.caseSensitiveInferenceMode</code> to <code class="language-plaintext highlighter-rouge">INFER_AND_SAVE</code>.</p>
</li>
<li>
<p>In Spark version 2.4 and below, a partition column value is converted to null if it cannot be cast to the corresponding user-provided schema. In Spark 3.0, partition column values are validated against the user-provided schema. An exception is thrown if the validation fails. You can disable such validation by setting <code class="language-plaintext highlighter-rouge">spark.sql.sources.validatePartitionColumns</code> to <code class="language-plaintext highlighter-rouge">false</code>.</p>
</li>
<li>
<p>In Spark 3.0, if files or subdirectories disappear during recursive directory listing (that is, they appear in an intermediate listing but then cannot be read or listed during later phases of the recursive directory listing, due to either concurrent file deletions or object store consistency issues) then the listing will fail with an exception unless <code class="language-plaintext highlighter-rouge">spark.sql.files.ignoreMissingFiles</code> is <code class="language-plaintext highlighter-rouge">true</code> (default <code class="language-plaintext highlighter-rouge">false</code>). In previous versions, these missing files or subdirectories would be ignored. Note that this change of behavior only applies during initial table file listing (or during <code class="language-plaintext highlighter-rouge">REFRESH TABLE</code>), not during query execution: the net change is that <code class="language-plaintext highlighter-rouge">spark.sql.files.ignoreMissingFiles</code> is now obeyed during table file listing / query planning, not only at query execution time.</p>
</li>
<li>
<p>In Spark version 2.4 and below, the parser of JSON data source treats empty strings as null for some data types such as <code class="language-plaintext highlighter-rouge">IntegerType</code>. For <code class="language-plaintext highlighter-rouge">FloatType</code>, <code class="language-plaintext highlighter-rouge">DoubleType</code>, <code class="language-plaintext highlighter-rouge">DateType</code> and <code class="language-plaintext highlighter-rouge">TimestampType</code>, it fails on empty strings and throws exceptions. Spark 3.0 disallows empty strings and will throw an exception for data types except for <code class="language-plaintext highlighter-rouge">StringType</code> and <code class="language-plaintext highlighter-rouge">BinaryType</code>. The previous behavior of allowing an empty string can be restored by setting <code class="language-plaintext highlighter-rouge">spark.sql.legacy.json.allowEmptyString.enabled</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark version 2.4 and below, JSON datasource and JSON functions like <code class="language-plaintext highlighter-rouge">from_json</code> convert a bad JSON record to a row with all <code class="language-plaintext highlighter-rouge">null</code>s in the PERMISSIVE mode when specified schema is <code class="language-plaintext highlighter-rouge">StructType</code>. In Spark 3.0, the returned row can contain non-<code class="language-plaintext highlighter-rouge">null</code> fields if some of JSON column values were parsed and converted to desired types successfully.</p>
</li>
<li>
<p>In Spark 3.0, the JSON datasource and the JSON function <code class="language-plaintext highlighter-rouge">schema_of_json</code> infer TimestampType from string values if they match the pattern defined by the JSON option <code class="language-plaintext highlighter-rouge">timestampFormat</code>. Set the JSON option <code class="language-plaintext highlighter-rouge">inferTimestamp</code> to <code class="language-plaintext highlighter-rouge">false</code> to disable such type inference.</p>
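<p>A minimal sketch of disabling the inference, assuming a <code class="language-plaintext highlighter-rouge">spark-shell</code> session; the input path is hypothetical:</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// Keep timestamp-looking strings as plain strings:
val df = spark.read
  .option("inferTimestamp", "false")
  .json("/tmp/events.json")
df.printSchema()
</code></pre></div></div>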
</li>
<li>
<p>In Spark version 2.4 and below, CSV datasource converts a malformed CSV string to a row with all <code class="language-plaintext highlighter-rouge">null</code>s in the PERMISSIVE mode. In Spark 3.0, the returned row can contain non-<code class="language-plaintext highlighter-rouge">null</code> fields if some of CSV column values were parsed and converted to desired types successfully.</p>
</li>
<li>
<p>In Spark 3.0, when Avro files are written with a user-provided schema, the fields are matched by field names between the catalyst schema and the Avro schema instead of by position.</p>
</li>
<li>
<p>In Spark 3.0, when Avro files are written with a user-provided non-nullable schema, Spark is still able to write the files even if the catalyst schema is nullable. However, Spark throws a runtime NullPointerException if any of the records contains null.</p>
</li>
<li>
<p>In Spark version 2.4 and below, the CSV datasource can detect the encoding of input files automatically when the files have a BOM at the beginning. For instance, the CSV datasource can recognize UTF-8, UTF-16BE, UTF-16LE, UTF-32BE and UTF-32LE in the multi-line mode (the CSV option <code class="language-plaintext highlighter-rouge">multiLine</code> is set to <code class="language-plaintext highlighter-rouge">true</code>). In Spark 3.0, the CSV datasource reads input files in the encoding specified via the CSV option <code class="language-plaintext highlighter-rouge">encoding</code>, which has the default value of UTF-8. In this way, if the file encoding does not match the encoding specified via the CSV option, Spark loads the file incorrectly. To solve the issue, users should either set the correct encoding via the CSV option <code class="language-plaintext highlighter-rouge">encoding</code> or set the option to <code class="language-plaintext highlighter-rouge">null</code>, which falls back to encoding auto-detection as in Spark versions before 3.0.</p>
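<p>A minimal sketch of declaring the encoding explicitly, assuming a <code class="language-plaintext highlighter-rouge">spark-shell</code> session; the input path is hypothetical:</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// In Spark 3.0 the declared encoding must match the file (default is UTF-8):
val utf16 = spark.read
  .option("multiLine", "true")
  .option("encoding", "UTF-16BE")
  .csv("/tmp/data_utf16.csv")
utf16.show()
</code></pre></div></div>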
</li>
</ul>
<h3 id="others">Others</h3>
<ul>
<li>
<p>In Spark version 2.4, when a Spark session is created via <code class="language-plaintext highlighter-rouge">cloneSession()</code>, the newly created Spark session inherits its configuration from its parent <code class="language-plaintext highlighter-rouge">SparkContext</code> even though the same configuration may exist with a different value in its parent Spark session. In Spark 3.0, the configurations of a parent <code class="language-plaintext highlighter-rouge">SparkSession</code> have a higher precedence over the parent <code class="language-plaintext highlighter-rouge">SparkContext</code>. You can restore the old behavior by setting <code class="language-plaintext highlighter-rouge">spark.sql.legacy.sessionInitWithConfigDefaults</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In Spark 3.0, if <code class="language-plaintext highlighter-rouge">hive.default.fileformat</code> is not found in <code class="language-plaintext highlighter-rouge">Spark SQL configuration</code> then it falls back to the <code class="language-plaintext highlighter-rouge">hive-site.xml</code> file present in the <code class="language-plaintext highlighter-rouge">Hadoop configuration</code> of <code class="language-plaintext highlighter-rouge">SparkContext</code>.</p>
</li>
<li>
<p>In Spark 3.0, we pad decimal numbers with trailing zeros to the scale of the column for the <code class="language-plaintext highlighter-rouge">spark-sql</code> interface, for example:</p>
<table>
<thead>
<tr>
<th>Query</th>
<th>Spark 2.4</th>
<th>Spark 3.0</th>
</tr>
</thead>
<tbody>
<tr>
<td><code class="language-plaintext highlighter-rouge">SELECT CAST(1 AS decimal(38, 18));</code></td>
<td>1</td>
<td>1.000000000000000000</td>
</tr>
</tbody>
</table>
</li>
<li>
<p>In Spark 3.0, we upgraded the built-in Hive from 1.2 to 2.3, which brings the following impacts:</p>
<ul>
<li>
<p>You may need to set <code class="language-plaintext highlighter-rouge">spark.sql.hive.metastore.version</code> and <code class="language-plaintext highlighter-rouge">spark.sql.hive.metastore.jars</code> according to the version of the Hive metastore you want to connect to. For example: set <code class="language-plaintext highlighter-rouge">spark.sql.hive.metastore.version</code> to <code class="language-plaintext highlighter-rouge">1.2.1</code> and <code class="language-plaintext highlighter-rouge">spark.sql.hive.metastore.jars</code> to <code class="language-plaintext highlighter-rouge">maven</code> if your Hive metastore version is 1.2.1.</p>
</li>
<li>
<p>You need to migrate your custom SerDes to Hive 2.3 or build your own Spark with <code class="language-plaintext highlighter-rouge">hive-1.2</code> profile. See <a href="https://issues.apache.org/jira/browse/HIVE-15167">HIVE-15167</a> for more details.</p>
</li>
<li>
<p>The decimal string representation can be different between Hive 1.2 and Hive 2.3 when using the <code class="language-plaintext highlighter-rouge">TRANSFORM</code> operator in SQL for script transformation, which depends on Hive&#8217;s behavior. In Hive 1.2, the string representation omits trailing zeroes. But in Hive 2.3, it is always padded to 18 digits with trailing zeroes if necessary.</p>
</li>
</ul>
</li>
</ul>
<h2 id="upgrading-from-spark-sql-247-to-248">Upgrading from Spark SQL 2.4.7 to 2.4.8</h2>
<ul>
<li>In Spark 2.4.8, <code class="language-plaintext highlighter-rouge">AnalysisException</code> is replaced by its sub-classes that are thrown for tables from Hive external catalog in the following situations:
<ul>
<li><code class="language-plaintext highlighter-rouge">ALTER TABLE .. ADD PARTITION</code> throws <code class="language-plaintext highlighter-rouge">PartitionsAlreadyExistException</code> if the new partition already exists</li>
<li><code class="language-plaintext highlighter-rouge">ALTER TABLE .. DROP PARTITION</code> throws <code class="language-plaintext highlighter-rouge">NoSuchPartitionsException</code> for non-existent partitions</li>
</ul>
</li>
</ul>
<h2 id="upgrading-from-spark-sql-245-to-246">Upgrading from Spark SQL 2.4.5 to 2.4.6</h2>
<ul>
<li>In Spark 2.4.6, the <code class="language-plaintext highlighter-rouge">RESET</code> command does not reset the static SQL configuration values to the default. It only clears the runtime SQL configuration values.</li>
</ul>
<h2 id="upgrading-from-spark-sql-244-to-245">Upgrading from Spark SQL 2.4.4 to 2.4.5</h2>
<ul>
<li>
<p>Since Spark 2.4.5, <code class="language-plaintext highlighter-rouge">TRUNCATE TABLE</code> command tries to set back original permission and ACLs during re-creating the table/partition paths. To restore the behaviour of earlier versions, set <code class="language-plaintext highlighter-rouge">spark.sql.truncateTable.ignorePermissionAcl.enabled</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>Since Spark 2.4.5, <code class="language-plaintext highlighter-rouge">spark.sql.legacy.mssqlserver.numericMapping.enabled</code> configuration is added in order to support the legacy MsSQLServer dialect mapping behavior using IntegerType and DoubleType for SMALLINT and REAL JDBC types, respectively. To restore the behaviour of 2.4.3 and earlier versions, set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.mssqlserver.numericMapping.enabled</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
</ul>
<h2 id="upgrading-from-spark-sql-243-to-244">Upgrading from Spark SQL 2.4.3 to 2.4.4</h2>
<ul>
<li>Since Spark 2.4.4, according to the <a href="https://docs.microsoft.com/en-us/sql/connect/jdbc/using-basic-data-types?view=sql-server-2017">MsSqlServer Guide</a>, the MsSQLServer JDBC Dialect uses ShortType and FloatType for SMALLINT and REAL, respectively. Previously, IntegerType and DoubleType were used.</li>
</ul>
<h2 id="upgrading-from-spark-sql-24-to-241">Upgrading from Spark SQL 2.4 to 2.4.1</h2>
<ul>
<li>
<p>The value of <code class="language-plaintext highlighter-rouge">spark.executor.heartbeatInterval</code>, when specified without units like &#8220;30&#8221; rather than &#8220;30s&#8221;, was
inconsistently interpreted as both seconds and milliseconds in Spark 2.4.0 in different parts of the code.
Unitless values are now consistently interpreted as milliseconds. Applications that set values like &#8220;30&#8221;
need to specify a value with units like &#8220;30s&#8221; now, to avoid being interpreted as milliseconds; otherwise,
the extremely short interval that results will likely cause applications to fail.</p>
</li>
<li>
<p>When converting a Dataset to another Dataset, Spark up-casts the fields in the original Dataset to the types of the corresponding fields in the target Dataset. In version 2.4 and earlier, this up-cast is not very strict, e.g. <code class="language-plaintext highlighter-rouge">Seq("str").toDS.as[Int]</code> fails, but <code class="language-plaintext highlighter-rouge">Seq("str").toDS.as[Boolean]</code> works and throws an NPE during execution. In Spark 3.0, the up-cast is stricter and turning String into something else is not allowed, i.e. <code class="language-plaintext highlighter-rouge">Seq("str").toDS.as[Boolean]</code> fails during analysis. To restore the behavior before 2.4.1, set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.looseUpcast</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
</ul>
<h2 id="upgrading-from-spark-sql-23-to-24">Upgrading from Spark SQL 2.3 to 2.4</h2>
<ul>
<li>In Spark version 2.3 and earlier, the second parameter to the array_contains function is implicitly promoted to the element type of the first, array-type parameter. This type promotion can be lossy and may cause the <code class="language-plaintext highlighter-rouge">array_contains</code> function to return a wrong result. This problem has been addressed in 2.4 by employing a safer type promotion mechanism. This can cause some changes in behavior, which are illustrated in the table below.
<table class="table">
<tr>
<th>
<b>Query</b>
</th>
<th>
<b>Spark 2.3 or Prior</b>
</th>
<th>
<b>Spark 2.4</b>
</th>
<th>
<b>Remarks</b>
</th>
</tr>
<tr>
<td>
<code>SELECT array_contains(array(1), 1.34D);</code>
</td>
<td>
<code>true</code>
</td>
<td>
<code>false</code>
</td>
<td>
In Spark 2.4, the left and right parameters are promoted to array of double type and double type, respectively.
</td>
</tr>
<tr>
<td>
<code>SELECT array_contains(array(1), '1');</code>
</td>
<td>
<code>true</code>
</td>
<td>
<code>AnalysisException</code> is thrown.
</td>
<td>
Explicit cast can be used in arguments to avoid the exception. In Spark 2.4, <code>AnalysisException</code> is thrown since integer type can not be promoted to string type in a loss-less manner.
</td>
</tr>
<tr>
<td>
<code>SELECT array_contains(array(1), 'anystring');</code>
</td>
<td>
<code>null</code>
</td>
<td>
<code>AnalysisException</code> is thrown.
</td>
<td>
Explicit cast can be used in arguments to avoid the exception. In Spark 2.4, <code>AnalysisException</code> is thrown since integer type can not be promoted to string type in a loss-less manner.
</td>
</tr>
</table>
</li>
<li>
<p>Since Spark 2.4, when there is a struct field in front of the IN operator before a subquery, the inner query must contain a struct field as well. In previous versions, instead, the fields of the struct were compared to the output of the inner query. For example, if <code class="language-plaintext highlighter-rouge">a</code> is a <code class="language-plaintext highlighter-rouge">struct(a string, b int)</code>, in Spark 2.4 <code class="language-plaintext highlighter-rouge">a in (select (1 as a, 'a' as b) from range(1))</code> is a valid query, while <code class="language-plaintext highlighter-rouge">a in (select 1, 'a' from range(1))</code> is not. In previous versions it was the opposite.</p>
</li>
<li>
<p>In versions 2.2.1+ and 2.3, if <code class="language-plaintext highlighter-rouge">spark.sql.caseSensitive</code> is set to true, then the <code class="language-plaintext highlighter-rouge">CURRENT_DATE</code> and <code class="language-plaintext highlighter-rouge">CURRENT_TIMESTAMP</code> functions incorrectly became case-sensitive and would resolve to columns (unless typed in lower case). In Spark 2.4 this has been fixed and the functions are no longer case-sensitive.</p>
</li>
<li>
<p>Since Spark 2.4, Spark evaluates the set operations referenced in a query by following a precedence rule as per the SQL standard. If the order is not specified by parentheses, set operations are performed from left to right with the exception that all INTERSECT operations are performed before any UNION, EXCEPT or MINUS operations. The old behaviour of giving equal precedence to all the set operations is preserved under a newly added configuration <code class="language-plaintext highlighter-rouge">spark.sql.legacy.setopsPrecedence.enabled</code> with a default value of <code class="language-plaintext highlighter-rouge">false</code>. When this property is set to <code class="language-plaintext highlighter-rouge">true</code>, Spark evaluates the set operators from left to right as they appear in the query, given that no explicit ordering is enforced by parentheses.</p>
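<p>A minimal sketch of the precedence difference, assuming a <code class="language-plaintext highlighter-rouge">spark-shell</code> session:</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>// INTERSECT is evaluated before UNION since Spark 2.4:
spark.sql("SELECT 1 UNION SELECT 2 INTERSECT SELECT 2").show()     // rows 1 and 2
spark.sql("(SELECT 1 UNION SELECT 2) INTERSECT SELECT 2").show()   // row 2 only

// Restore the old left-to-right evaluation:
spark.conf.set("spark.sql.legacy.setopsPrecedence.enabled", "true")
</code></pre></div></div>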
</li>
<li>
<p>Since Spark 2.4, Spark displays the table description column Last Access value as UNKNOWN when the value was Jan 01 1970.</p>
</li>
<li>
<p>Since Spark 2.4, Spark maximizes the usage of a vectorized ORC reader for ORC files by default. To do that, <code class="language-plaintext highlighter-rouge">spark.sql.orc.impl</code> and <code class="language-plaintext highlighter-rouge">spark.sql.orc.filterPushdown</code> change their default values to <code class="language-plaintext highlighter-rouge">native</code> and <code class="language-plaintext highlighter-rouge">true</code> respectively. ORC files created by native ORC writer cannot be read by some old Apache Hive releases. Use <code class="language-plaintext highlighter-rouge">spark.sql.orc.impl=hive</code> to create the files shared with Hive 2.1.1 and older.</p>
</li>
<li>
<p>Since Spark 2.4, writing an empty dataframe to a directory launches at least one write task, even if physically the dataframe has no partition. This introduces a small behavior change: for self-describing file formats like Parquet and Orc, Spark creates a metadata-only file in the target directory when writing a 0-partition dataframe, so that schema inference can still work if users read that directory later. The new behavior is more reasonable and more consistent regarding writing an empty dataframe.</p>
</li>
<li>
<p>Since Spark 2.4, expression IDs in UDF arguments do not appear in column names. For example, a column name in Spark 2.4 is not <code class="language-plaintext highlighter-rouge">UDF:f(col0 AS colA#28)</code> but <code class="language-plaintext highlighter-rouge">UDF:f(col0 AS `colA`)</code>.</p>
</li>
<li>
<p>Since Spark 2.4, writing a dataframe with an empty or nested empty schema using any file formats (parquet, orc, json, text, csv etc.) is not allowed. An exception is thrown when attempting to write dataframes with empty schema.</p>
</li>
<li>
<p>Since Spark 2.4, Spark compares a DATE type with a TIMESTAMP type after promoting both sides to TIMESTAMP. Setting <code class="language-plaintext highlighter-rouge">spark.sql.legacy.compareDateTimestampInTimestamp</code> to <code class="language-plaintext highlighter-rouge">false</code> restores the previous behavior. This option will be removed in Spark 3.0.</p>
</li>
<li>
<p>Since Spark 2.4, creating a managed table with a nonempty location is not allowed. An exception is thrown when attempting to create a managed table with a nonempty location. Setting <code class="language-plaintext highlighter-rouge">spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation</code> to <code class="language-plaintext highlighter-rouge">true</code> restores the previous behavior. This option will be removed in Spark 3.0.</p>
</li>
<li>
<p>Since Spark 2.4, renaming a managed table to an existing location is not allowed. An exception is thrown when attempting to rename a managed table to an existing location.</p>
</li>
<li>
<p>Since Spark 2.4, the type coercion rules can automatically promote the argument types of the variadic SQL functions (e.g., IN/COALESCE) to the widest common type, regardless of the order of the input arguments. In prior Spark versions, the promotion could fail in some specific orders (e.g., TimestampType, IntegerType and StringType) and throw an exception.</p>
</li>
<li>
<p>Since Spark 2.4, Spark has enabled non-cascading SQL cache invalidation in addition to the traditional cache invalidation mechanism. The non-cascading cache invalidation mechanism allows users to remove a cache without impacting its dependent caches. This new cache invalidation mechanism is used in scenarios where the data of the cache to be removed is still valid, e.g., calling unpersist() on a Dataset, or dropping a temporary view. This allows users to free up memory and keep the desired caches valid at the same time.</p>
</li>
<li>
<p>In version 2.3 and earlier, Spark converts Parquet Hive tables by default but ignores table properties like <code class="language-plaintext highlighter-rouge">TBLPROPERTIES (parquet.compression 'NONE')</code>. This happens for ORC Hive table properties like <code class="language-plaintext highlighter-rouge">TBLPROPERTIES (orc.compress 'NONE')</code> in case of <code class="language-plaintext highlighter-rouge">spark.sql.hive.convertMetastoreOrc=true</code>, too. Since Spark 2.4, Spark respects Parquet/ORC specific table properties while converting Parquet/ORC Hive tables. As an example, <code class="language-plaintext highlighter-rouge">CREATE TABLE t(id int) STORED AS PARQUET TBLPROPERTIES (parquet.compression 'NONE')</code> would generate Snappy parquet files during insertion in Spark 2.3, and in Spark 2.4, the result would be uncompressed parquet files.</p>
</li>
<li>
<p>Since Spark 2.0, Spark converts Parquet Hive tables by default for better performance. Since Spark 2.4, Spark converts ORC Hive tables by default, too. It means Spark uses its own ORC support by default instead of Hive SerDe. As an example, <code class="language-plaintext highlighter-rouge">CREATE TABLE t(id int) STORED AS ORC</code> would be handled with Hive SerDe in Spark 2.3, and in Spark 2.4, it would be converted into Spark&#8217;s ORC data source table and ORC vectorization would be applied. Setting <code class="language-plaintext highlighter-rouge">spark.sql.hive.convertMetastoreOrc</code> to <code class="language-plaintext highlighter-rouge">false</code> restores the previous behavior.</p>
</li>
<li>
<p>In version 2.3 and earlier, CSV rows are considered malformed if at least one column value in the row is malformed. The CSV parser drops such rows in the DROPMALFORMED mode or outputs an error in the FAILFAST mode. Since Spark 2.4, a CSV row is considered malformed only when it contains malformed column values requested from the CSV datasource; other values can be ignored. As an example, a CSV file contains the &#8220;id,name&#8221; header and one row &#8220;1234&#8221;. In Spark 2.4, selection of the id column consists of a row with one column value 1234, but in Spark 2.3 and earlier it is empty in the DROPMALFORMED mode. To restore the previous behavior, set <code class="language-plaintext highlighter-rouge">spark.sql.csv.parser.columnPruning.enabled</code> to <code class="language-plaintext highlighter-rouge">false</code>.</p>
</li>
<li>
<p>Since Spark 2.4, file listing for computing statistics is done in parallel by default. This can be disabled by setting <code class="language-plaintext highlighter-rouge">spark.sql.statistics.parallelFileListingInStatsComputation.enabled</code> to <code class="language-plaintext highlighter-rouge">false</code>.</p>
</li>
<li>
<p>Since Spark 2.4, Metadata files (e.g. Parquet summary files) and temporary files are not counted as data files when calculating table size during Statistics computation.</p>
</li>
<li>
<p>Since Spark 2.4, empty strings are saved as quoted empty strings <code class="language-plaintext highlighter-rouge">""</code>. In version 2.3 and earlier, empty strings are equal to <code class="language-plaintext highlighter-rouge">null</code> values and are not reflected by any characters in saved CSV files. For example, the row of <code class="language-plaintext highlighter-rouge">"a", null, "", 1</code> was written as <code class="language-plaintext highlighter-rouge">a,,,1</code>. Since Spark 2.4, the same row is saved as <code class="language-plaintext highlighter-rouge">a,,"",1</code>. To restore the previous behavior, set the CSV option <code class="language-plaintext highlighter-rouge">emptyValue</code> to an empty (not quoted) string.</p>
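<p>A minimal sketch of the <code class="language-plaintext highlighter-rouge">emptyValue</code> option, assuming a <code class="language-plaintext highlighter-rouge">spark-shell</code> session; output paths and data are hypothetical:</p>
<div class="language-scala highlighter-rouge"><div class="highlight"><pre class="highlight"><code>import spark.implicits._

// Since Spark 2.4 an empty string is written as "" by default:
val df = Seq(("a", "", 1)).toDF("c1", "c2", "c3")
df.write.csv("/tmp/csv_quoted_empty")                        // a,"",1

// Write empty strings with no characters, as in Spark 2.3 and earlier:
df.write.option("emptyValue", "").csv("/tmp/csv_bare_empty") // a,,1
</code></pre></div></div>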
</li>
<li>
<p>Since Spark 2.4, the LOAD DATA command supports the wildcards <code class="language-plaintext highlighter-rouge">?</code> and <code class="language-plaintext highlighter-rouge">*</code>, which match any one character and zero or more characters, respectively. Example: <code class="language-plaintext highlighter-rouge">LOAD DATA INPATH '/tmp/folder*/'</code> or <code class="language-plaintext highlighter-rouge">LOAD DATA INPATH '/tmp/part-?'</code>. Special characters like <code class="language-plaintext highlighter-rouge">space</code> also now work in paths. Example: <code class="language-plaintext highlighter-rouge">LOAD DATA INPATH '/tmp/folder name/'</code>.</p>
</li>
<li>
<p>In Spark version 2.3 and earlier, HAVING without GROUP BY is treated as WHERE. This means <code class="language-plaintext highlighter-rouge">SELECT 1 FROM range(10) HAVING true</code> is executed as <code class="language-plaintext highlighter-rouge">SELECT 1 FROM range(10) WHERE true</code> and returns 10 rows. This violates the SQL standard and has been fixed in Spark 2.4. Since Spark 2.4, HAVING without GROUP BY is treated as a global aggregate, which means <code class="language-plaintext highlighter-rouge">SELECT 1 FROM range(10) HAVING true</code> returns only one row. To restore the previous behavior, set <code class="language-plaintext highlighter-rouge">spark.sql.legacy.parser.havingWithoutGroupByAsWhere</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>In version 2.3 and earlier, when reading from a Parquet data source table, Spark always returns null for any column whose column names in the Hive metastore schema and the Parquet schema are in different letter cases, no matter whether <code class="language-plaintext highlighter-rouge">spark.sql.caseSensitive</code> is set to <code class="language-plaintext highlighter-rouge">true</code> or <code class="language-plaintext highlighter-rouge">false</code>. Since 2.4, when <code class="language-plaintext highlighter-rouge">spark.sql.caseSensitive</code> is set to <code class="language-plaintext highlighter-rouge">false</code>, Spark does case-insensitive column name resolution between the Hive metastore schema and the Parquet schema, so even if column names are in different letter cases, Spark returns the corresponding column values. An exception is thrown if there is ambiguity, i.e. more than one Parquet column is matched. This change also applies to Parquet Hive tables when <code class="language-plaintext highlighter-rouge">spark.sql.hive.convertMetastoreParquet</code> is set to <code class="language-plaintext highlighter-rouge">true</code>.</li>
</ul>
<h2 id="upgrading-from-spark-sql-22-to-23">Upgrading from Spark SQL 2.2 to 2.3</h2>
<ul>
<li>
<p>Since Spark 2.3, queries over raw JSON/CSV files are disallowed when the referenced columns include only the internal corrupt record column (named <code class="language-plaintext highlighter-rouge">_corrupt_record</code> by default). For example, <code class="language-plaintext highlighter-rouge">spark.read.schema(schema).json(file).filter($"_corrupt_record".isNotNull).count()</code> and <code class="language-plaintext highlighter-rouge">spark.read.schema(schema).json(file).select("_corrupt_record").show()</code>. Instead, you can cache or save the parsed results and then run the same query. For example, <code class="language-plaintext highlighter-rouge">val df = spark.read.schema(schema).json(file).cache()</code> and then <code class="language-plaintext highlighter-rouge">df.filter($"_corrupt_record".isNotNull).count()</code>.</p>
</li>
<li>
<p>The <code class="language-plaintext highlighter-rouge">percentile_approx</code> function previously accepted only numeric input and returned results of double type. It now also supports date and timestamp types as input, and the result type is the same as the input type, which is more reasonable for percentiles.</p>
</li>
<li>
<p>Since Spark 2.3, a Join/Filter&#8217;s deterministic predicates that come after the first non-deterministic predicate are also pushed down/through the child operators, if possible. In prior Spark versions, these filters were not eligible for predicate pushdown.</p>
</li>
<li>Partition column inference previously found an incorrect common type for different inferred types; for example, it previously ended up with double type as the common type for double type and date type. Now it finds the correct common type for such conflicts. Conflict resolution follows the table below:
<table class="table">
<tr>
<th>
<b>InputA \ InputB</b>
</th>
<th>
<b>NullType</b>
</th>
<th>
<b>IntegerType</b>
</th>
<th>
<b>LongType</b>
</th>
<th>
<b>DecimalType(38,0)*</b>
</th>
<th>
<b>DoubleType</b>
</th>
<th>
<b>DateType</b>
</th>
<th>
<b>TimestampType</b>
</th>
<th>
<b>StringType</b>
</th>
</tr>
<tr>
<td>
<b>NullType</b>
</td>
<td>NullType</td>
<td>IntegerType</td>
<td>LongType</td>
<td>DecimalType(38,0)</td>
<td>DoubleType</td>
<td>DateType</td>
<td>TimestampType</td>
<td>StringType</td>
</tr>
<tr>
<td>
<b>IntegerType</b>
</td>
<td>IntegerType</td>
<td>IntegerType</td>
<td>LongType</td>
<td>DecimalType(38,0)</td>
<td>DoubleType</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
</tr>
<tr>
<td>
<b>LongType</b>
</td>
<td>LongType</td>
<td>LongType</td>
<td>LongType</td>
<td>DecimalType(38,0)</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
</tr>
<tr>
<td>
<b>DecimalType(38,0)*</b>
</td>
<td>DecimalType(38,0)</td>
<td>DecimalType(38,0)</td>
<td>DecimalType(38,0)</td>
<td>DecimalType(38,0)</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
</tr>
<tr>
<td>
<b>DoubleType</b>
</td>
<td>DoubleType</td>
<td>DoubleType</td>
<td>StringType</td>
<td>StringType</td>
<td>DoubleType</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
</tr>
<tr>
<td>
<b>DateType</b>
</td>
<td>DateType</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
<td>DateType</td>
<td>TimestampType</td>
<td>StringType</td>
</tr>
<tr>
<td>
<b>TimestampType</b>
</td>
<td>TimestampType</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
<td>TimestampType</td>
<td>TimestampType</td>
<td>StringType</td>
</tr>
<tr>
<td>
<b>StringType</b>
</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
<td>StringType</td>
</tr>
</table>
<p>Note that, for <b>DecimalType(38,0)*</b>, the table above intentionally does not cover all other combinations of scales and precisions, because currently decimal type is only inferred for integral values (i.e. values like <code class="language-plaintext highlighter-rouge">BigInteger</code>/<code class="language-plaintext highlighter-rouge">BigInt</code>). For example, 1.1 is inferred as double type.</p>
</li>
<li>
<p>Since Spark 2.3, when either broadcast hash join or broadcast nested loop join is applicable, Spark prefers to broadcast the table that is explicitly specified in a broadcast hint. For details, see the section <a href="sql-performance-tuning.html#join-strategy-hints-for-sql-queries">Join Strategy Hints for SQL Queries</a> and <a href="https://issues.apache.org/jira/browse/SPARK-22489">SPARK-22489</a>.</p>
</li>
<li>
<p>Since Spark 2.3, when all inputs are binary, <code class="language-plaintext highlighter-rouge">functions.concat()</code> returns binary output; otherwise, it returns a string. Before Spark 2.3, it always returned a string regardless of input types. To keep the old behavior, set <code class="language-plaintext highlighter-rouge">spark.sql.function.concatBinaryAsString</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>Since Spark 2.3, when all inputs are binary, SQL <code class="language-plaintext highlighter-rouge">elt()</code> returns binary output; otherwise, it returns a string. Before Spark 2.3, it always returned a string regardless of input types. To keep the old behavior, set <code class="language-plaintext highlighter-rouge">spark.sql.function.eltOutputAsString</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
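<p>For example, a minimal sketch (assuming an existing SparkSession named <code class="language-plaintext highlighter-rouge">spark</code>) of keeping the pre-2.3 string output for both functions above:</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Keep the old (string) output of concat() and elt() even for all-binary inputs.
spark.conf.set("spark.sql.function.concatBinaryAsString", "true")
spark.conf.set("spark.sql.function.eltOutputAsString", "true")
</code></pre></figure>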
</li>
<li>
<p>Since Spark 2.3, by default arithmetic operations between decimals return a rounded value if an exact representation is not possible (instead of returning NULL). This is compliant with the SQL ANSI 2011 specification and Hive&#8217;s new behavior introduced in Hive 2.2 (HIVE-15331). This involves the following changes:</p>
<ul>
<li>
<p>The rules to determine the result type of an arithmetic operation have been updated. In particular, if the needed precision/scale is out of the range of available values, the scale is reduced (down to a minimum of 6) in order to prevent truncation of the integer part of the decimals. All arithmetic operations are affected by the change, i.e. addition (<code class="language-plaintext highlighter-rouge">+</code>), subtraction (<code class="language-plaintext highlighter-rouge">-</code>), multiplication (<code class="language-plaintext highlighter-rouge">*</code>), division (<code class="language-plaintext highlighter-rouge">/</code>), remainder (<code class="language-plaintext highlighter-rouge">%</code>) and positive modulus (<code class="language-plaintext highlighter-rouge">pmod</code>).</p>
</li>
<li>
<p>Literal values used in SQL operations are converted to DECIMAL with the exact precision and scale needed by them.</p>
</li>
<li>
<p>The configuration <code class="language-plaintext highlighter-rouge">spark.sql.decimalOperations.allowPrecisionLoss</code> has been introduced. It defaults to <code class="language-plaintext highlighter-rouge">true</code>, which means the new behavior described here; if set to <code class="language-plaintext highlighter-rouge">false</code>, Spark uses previous rules, i.e. it doesn&#8217;t adjust the needed scale to represent the values and it returns NULL if an exact representation of the value is not possible.</p>
</li>
</ul>
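<p>A minimal sketch (assuming an existing SparkSession named <code class="language-plaintext highlighter-rouge">spark</code>) of reverting to the previous decimal arithmetic rules:</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Return NULL instead of a rounded value when the exact result does not fit
// the available precision/scale (pre-2.3 behavior).
spark.conf.set("spark.sql.decimalOperations.allowPrecisionLoss", "false")
</code></pre></figure>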
</li>
<li>
<p>The semantics of un-aliased subqueries were never well defined and had confusing behaviors. Since Spark 2.3, we invalidate such confusing cases; for example, <code class="language-plaintext highlighter-rouge">SELECT v.i from (SELECT i FROM v)</code> now throws an analysis exception because users should not be able to use the qualifier inside a subquery. See <a href="https://issues.apache.org/jira/browse/SPARK-20690">SPARK-20690</a> and <a href="https://issues.apache.org/jira/browse/SPARK-21335">SPARK-21335</a> for more details.</p>
</li>
<li>When creating a <code class="language-plaintext highlighter-rouge">SparkSession</code> with <code class="language-plaintext highlighter-rouge">SparkSession.builder.getOrCreate()</code>, if there is an existing <code class="language-plaintext highlighter-rouge">SparkContext</code>, the builder used to try to update the <code class="language-plaintext highlighter-rouge">SparkConf</code> of the existing <code class="language-plaintext highlighter-rouge">SparkContext</code> with the configurations specified to the builder. However, the <code class="language-plaintext highlighter-rouge">SparkContext</code> is shared by all <code class="language-plaintext highlighter-rouge">SparkSession</code>s, so it should not be updated. Since 2.3, the builder no longer updates these configurations. If you want to change them, set them before the <code class="language-plaintext highlighter-rouge">SparkSession</code> is created, as sketched below.</li>
</ul>
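<p>A minimal sketch (with an illustrative application name and configuration) of setting configurations on the builder before the first <code class="language-plaintext highlighter-rouge">SparkSession</code> is created:</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">import org.apache.spark.sql.SparkSession

// Since 2.3, configurations passed to the builder no longer update an existing
// SparkContext, so set them before the first SparkSession/SparkContext is created.
val spark = SparkSession.builder()
  .appName("my-app")                              // illustrative application name
  .config("spark.sql.shuffle.partitions", "400")  // illustrative configuration
  .getOrCreate()
</code></pre></figure>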
<h2 id="upgrading-from-spark-sql-21-to-22">Upgrading from Spark SQL 2.1 to 2.2</h2>
<ul>
<li>
<p>Spark 2.1.1 introduced a new configuration key: <code class="language-plaintext highlighter-rouge">spark.sql.hive.caseSensitiveInferenceMode</code>. It had a default setting of <code class="language-plaintext highlighter-rouge">NEVER_INFER</code>, which kept behavior identical to 2.1.0. However, Spark 2.2.0 changes this setting&#8217;s default value to <code class="language-plaintext highlighter-rouge">INFER_AND_SAVE</code> to restore compatibility with reading Hive metastore tables whose underlying file schema has mixed-case column names. With the <code class="language-plaintext highlighter-rouge">INFER_AND_SAVE</code> configuration value, on first access Spark performs schema inference on any Hive metastore table for which it has not already saved an inferred schema. Note that schema inference can be a very time-consuming operation for tables with thousands of partitions. If compatibility with mixed-case column names is not a concern, you can safely set <code class="language-plaintext highlighter-rouge">spark.sql.hive.caseSensitiveInferenceMode</code> to <code class="language-plaintext highlighter-rouge">NEVER_INFER</code> to avoid the initial overhead of schema inference. Note that with the new default <code class="language-plaintext highlighter-rouge">INFER_AND_SAVE</code> setting, the results of the schema inference are saved as a metastore key for future use, so the initial schema inference occurs only at a table&#8217;s first access.</p>
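<p>For example, a minimal sketch (assuming an existing SparkSession named <code class="language-plaintext highlighter-rouge">spark</code>) of opting out of the one-time schema inference:</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Skip the potentially expensive first-access schema inference when mixed-case
// column names are not a concern.
spark.conf.set("spark.sql.hive.caseSensitiveInferenceMode", "NEVER_INFER")
</code></pre></figure>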
</li>
<li>
<p>Since Spark 2.2.1 and 2.3.0, the schema is always inferred at runtime when the data source tables have columns that exist in both the partition schema and the data schema. The inferred schema does not have the partitioned columns. When reading the table, Spark respects the partition values of these overlapping columns instead of the values stored in the data source files. In the 2.2.0 and 2.1.x releases, the inferred schema is partitioned but the data of the table is invisible to users (i.e., the result set is empty).</p>
</li>
<li>
<p>Since Spark 2.2, view definitions are stored in a different way from prior versions. This may make Spark unable to read views created by prior versions. In such cases, you need to recreate the views using <code class="language-plaintext highlighter-rouge">ALTER VIEW AS</code> or <code class="language-plaintext highlighter-rouge">CREATE OR REPLACE VIEW AS</code> with newer Spark versions.</p>
</li>
</ul>
<h2 id="upgrading-from-spark-sql-20-to-21">Upgrading from Spark SQL 2.0 to 2.1</h2>
<ul>
<li>
<p>Datasource tables now store partition metadata in the Hive metastore. This means that Hive DDLs such as <code class="language-plaintext highlighter-rouge">ALTER TABLE PARTITION ... SET LOCATION</code> are now available for tables created with the Datasource API.</p>
<ul>
<li>
<p>Legacy datasource tables can be migrated to this format via the <code class="language-plaintext highlighter-rouge">MSCK REPAIR TABLE</code> command. Migrating legacy tables is recommended to take advantage of Hive DDL support and improved planning performance.</p>
</li>
<li>
<p>To determine if a table has been migrated, look for the <code class="language-plaintext highlighter-rouge">PartitionProvider: Catalog</code> attribute when issuing <code class="language-plaintext highlighter-rouge">DESCRIBE FORMATTED</code> on the table.</p>
</li>
</ul>
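<p>A minimal sketch (assuming an existing SparkSession named <code class="language-plaintext highlighter-rouge">spark</code> and an illustrative table name <code class="language-plaintext highlighter-rouge">my_table</code>) of migrating a legacy table and checking the result:</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Migrate the legacy datasource table's partition metadata into the Hive metastore.
spark.sql("MSCK REPAIR TABLE my_table")

// Look for "PartitionProvider: Catalog" in the output to confirm the migration.
spark.sql("DESCRIBE FORMATTED my_table").show(100, false)
</code></pre></figure>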
</li>
<li>
<p>Changes to <code class="language-plaintext highlighter-rouge">INSERT OVERWRITE TABLE ... PARTITION ...</code> behavior for Datasource tables.</p>
<ul>
<li>
<p>In prior Spark versions <code class="language-plaintext highlighter-rouge">INSERT OVERWRITE</code> overwrote the entire Datasource table, even when given a partition specification. Now only partitions matching the specification are overwritten.</p>
</li>
<li>
<p>Note that this still differs from the behavior of Hive tables, which is to overwrite only partitions overlapping with newly inserted data.</p>
</li>
</ul>
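<p>A minimal sketch (assuming an existing SparkSession named <code class="language-plaintext highlighter-rouge">spark</code> and illustrative table and column names) of overwriting a single partition of a Datasource table:</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Since 2.1, only the partition matching the PARTITION clause is overwritten,
// not the entire table.
spark.sql(
  "INSERT OVERWRITE TABLE logs PARTITION (ds = '2017-01-01') " +
  "SELECT id, msg FROM new_logs")
</code></pre></figure>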
</li>
</ul>
<h2 id="upgrading-from-spark-sql-16-to-20">Upgrading from Spark SQL 1.6 to 2.0</h2>
<ul>
<li>
<p><code class="language-plaintext highlighter-rouge">SparkSession</code> is now the new entry point of Spark that replaces the old <code class="language-plaintext highlighter-rouge">SQLContext</code> and <code class="language-plaintext highlighter-rouge">HiveContext</code>. Note that the old SQLContext and HiveContext are kept for backward compatibility. A new <code class="language-plaintext highlighter-rouge">catalog</code> interface is accessible from <code class="language-plaintext highlighter-rouge">SparkSession</code> - the existing APIs for accessing databases and tables, such as <code class="language-plaintext highlighter-rouge">listTables</code>, <code class="language-plaintext highlighter-rouge">createExternalTable</code>, <code class="language-plaintext highlighter-rouge">dropTempView</code> and <code class="language-plaintext highlighter-rouge">cacheTable</code>, have been moved here.</p>
</li>
<li>
<p>Dataset API and DataFrame API are unified. In Scala, <code class="language-plaintext highlighter-rouge">DataFrame</code> becomes a type alias for
<code class="language-plaintext highlighter-rouge">Dataset[Row]</code>, while Java API users must replace <code class="language-plaintext highlighter-rouge">DataFrame</code> with <code class="language-plaintext highlighter-rouge">Dataset&lt;Row&gt;</code>. Both the typed
transformations (e.g., <code class="language-plaintext highlighter-rouge">map</code>, <code class="language-plaintext highlighter-rouge">filter</code>, and <code class="language-plaintext highlighter-rouge">groupByKey</code>) and untyped transformations (e.g.,
<code class="language-plaintext highlighter-rouge">select</code> and <code class="language-plaintext highlighter-rouge">groupBy</code>) are available on the Dataset class. Since compile-time type-safety in
Python and R is not a language feature, the concept of Dataset does not apply to these languages’
APIs. Instead, <code class="language-plaintext highlighter-rouge">DataFrame</code> remains the primary programming abstraction, which is analogous to the
single-node data frame notion in these languages.</p>
</li>
<li>
<p>Dataset and DataFrame API <code class="language-plaintext highlighter-rouge">unionAll</code> has been deprecated and replaced by <code class="language-plaintext highlighter-rouge">union</code>.</p>
</li>
<li>
<p>Dataset and DataFrame API <code class="language-plaintext highlighter-rouge">explode</code> has been deprecated; instead, use <code class="language-plaintext highlighter-rouge">functions.explode()</code> with <code class="language-plaintext highlighter-rouge">select</code> or <code class="language-plaintext highlighter-rouge">flatMap</code>, as sketched below.</p>
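<p>A minimal sketch, assuming a DataFrame <code class="language-plaintext highlighter-rouge">df</code> with illustrative columns <code class="language-plaintext highlighter-rouge">id</code> and <code class="language-plaintext highlighter-rouge">items</code> (an array column):</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">import org.apache.spark.sql.functions.explode

// Replaces the deprecated df.explode(...) pattern: one output row per array element.
val exploded = df.select(df("id"), explode(df("items")).as("item"))
</code></pre></figure>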
</li>
<li>
<p>Dataset and DataFrame API <code class="language-plaintext highlighter-rouge">registerTempTable</code> has been deprecated and replaced by <code class="language-plaintext highlighter-rouge">createOrReplaceTempView</code>.</p>
</li>
<li>
<p>Changes to <code class="language-plaintext highlighter-rouge">CREATE TABLE ... LOCATION</code> behavior for Hive tables.</p>
<ul>
<li>
<p>From Spark 2.0, <code class="language-plaintext highlighter-rouge">CREATE TABLE ... LOCATION</code> is equivalent to <code class="language-plaintext highlighter-rouge">CREATE EXTERNAL TABLE ... LOCATION</code>
in order to prevent accidentally dropping existing data in the user-provided locations.
That means a Hive table created in Spark SQL with a user-specified location is always a Hive external table.
Dropping external tables will not remove the data. Users are not allowed to specify the location for Hive managed tables.
Note that this is different from the Hive behavior.</p>
</li>
<li>
<p>As a result, <code class="language-plaintext highlighter-rouge">DROP TABLE</code> statements on those tables will not remove the data.</p>
</li>
</ul>
</li>
<li>
<p><code class="language-plaintext highlighter-rouge">spark.sql.parquet.cacheMetadata</code> is no longer used.
See <a href="https://issues.apache.org/jira/browse/SPARK-13664">SPARK-13664</a> for details.</p>
</li>
</ul>
<h2 id="upgrading-from-spark-sql-15-to-16">Upgrading from Spark SQL 1.5 to 1.6</h2>
<ul>
<li>From Spark 1.6, by default, the Thrift server runs in multi-session mode, which means each JDBC/ODBC
connection owns its own copy of the SQL configuration and temporary function registry. Cached
tables are still shared, though. If you prefer to run the Thrift server in the old single-session
mode, please set option <code class="language-plaintext highlighter-rouge">spark.sql.hive.thriftServer.singleSession</code> to <code class="language-plaintext highlighter-rouge">true</code>. You may either add
this option to <code class="language-plaintext highlighter-rouge">spark-defaults.conf</code>, or pass it to <code class="language-plaintext highlighter-rouge">start-thriftserver.sh</code> via <code class="language-plaintext highlighter-rouge">--conf</code>:</li>
</ul>
<figure class="highlight"><pre><code class="language-bash" data-lang="bash"> ./sbin/start-thriftserver.sh <span class="se">\</span>
<span class="nt">--conf</span> spark.sql.hive.thriftServer.singleSession<span class="o">=</span><span class="nb">true</span> <span class="se">\</span>
...
</code></pre></figure>
<ul>
<li>From Spark 1.6, LongType casts to TimestampType expect seconds instead of microseconds. This
change was made to match the behavior of Hive 1.2 for more consistent type casting to TimestampType
from numeric types. See <a href="https://issues.apache.org/jira/browse/SPARK-11724">SPARK-11724</a> for
details.</li>
</ul>
<h2 id="upgrading-from-spark-sql-14-to-15">Upgrading from Spark SQL 1.4 to 1.5</h2>
<ul>
<li>
<p>Optimized execution using manually managed memory (Tungsten) is now enabled by default, along with
code generation for expression evaluation. These features can both be disabled by setting
<code class="language-plaintext highlighter-rouge">spark.sql.tungsten.enabled</code> to <code class="language-plaintext highlighter-rouge">false</code>.</p>
</li>
<li>
<p>Parquet schema merging is no longer enabled by default. It can be re-enabled by setting
<code class="language-plaintext highlighter-rouge">spark.sql.parquet.mergeSchema</code> to <code class="language-plaintext highlighter-rouge">true</code>.</p>
</li>
<li>
<p>In-memory columnar storage partition pruning is on by default. It can be disabled by setting
<code class="language-plaintext highlighter-rouge">spark.sql.inMemoryColumnarStorage.partitionPruning</code> to <code class="language-plaintext highlighter-rouge">false</code>.</p>
</li>
<li>
<p>Unlimited precision decimal columns are no longer supported; instead, Spark SQL enforces a maximum
precision of 38. When inferring schema from <code class="language-plaintext highlighter-rouge">BigDecimal</code> objects, a precision of (38, 18) is now
used. When no precision is specified in DDL, the default remains <code class="language-plaintext highlighter-rouge">Decimal(10, 0)</code>.</p>
</li>
<li>
<p>Timestamps are now stored at a precision of 1us, rather than 1ns.</p>
</li>
<li>
<p>In the <code class="language-plaintext highlighter-rouge">sql</code> dialect, floating point numbers are now parsed as decimal. HiveQL parsing remains
unchanged.</p>
</li>
<li>
<p>The canonical names of SQL/DataFrame functions are now lower case (e.g., sum vs SUM).</p>
</li>
<li>
<p>The JSON data source will not automatically load new files that are created by other applications
(i.e. files that are not inserted into the dataset through Spark SQL).
For a JSON persistent table (i.e. the metadata of the table is stored in the Hive metastore),
users can use the <code class="language-plaintext highlighter-rouge">REFRESH TABLE</code> SQL command or <code class="language-plaintext highlighter-rouge">HiveContext</code>&#8217;s <code class="language-plaintext highlighter-rouge">refreshTable</code> method
to include those new files in the table. For a DataFrame representing a JSON dataset, users need to recreate
the DataFrame, and the new DataFrame will include the new files.</p>
</li>
</ul>
<h2 id="upgrading-from-spark-sql-13-to-14">Upgrading from Spark SQL 1.3 to 1.4</h2>
<h4 class="no_toc" id="dataframe-data-readerwriter-interface">DataFrame data reader/writer interface</h4>
<p>Based on user feedback, we created a new, more fluid API for reading data in (<code class="language-plaintext highlighter-rouge">SQLContext.read</code>)
and writing data out (<code class="language-plaintext highlighter-rouge">DataFrame.write</code>),
and deprecated the old APIs (e.g., <code class="language-plaintext highlighter-rouge">SQLContext.parquetFile</code>, <code class="language-plaintext highlighter-rouge">SQLContext.jsonFile</code>).</p>
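<p>A minimal sketch (assuming an existing <code class="language-plaintext highlighter-rouge">SQLContext</code> named <code class="language-plaintext highlighter-rouge">sqlContext</code> and illustrative paths) of the new reader/writer interface:</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">// Old, deprecated API:
//   val df = sqlContext.parquetFile("/path/to/data.parquet")

// New reader/writer API:
val df = sqlContext.read.parquet("/path/to/data.parquet")
df.write.format("json").save("/path/to/output")
</code></pre></figure>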
<p>See the API docs for <code class="language-plaintext highlighter-rouge">SQLContext.read</code> (
<a href="api/scala/org/apache/spark/sql/SQLContext.html#read:DataFrameReader">Scala</a>,
<a href="api/java/org/apache/spark/sql/SQLContext.html#read()">Java</a>,
<a href="api/python/reference/api/pyspark.sql.SparkSession.read.html#pyspark.sql.SparkSession.read">Python</a>
) and <code class="language-plaintext highlighter-rouge">DataFrame.write</code> (
<a href="api/scala/org/apache/spark/sql/DataFrame.html#write:DataFrameWriter">Scala</a>,
<a href="api/java/org/apache/spark/sql/Dataset.html#write()">Java</a>,
<a href="api/python/reference/api/pyspark.sql.DataFrame.write.html#pyspark.sql.DataFrame.write">Python</a>
) for more information.</p>
<h4 class="no_toc" id="dataframegroupby-retains-grouping-columns">DataFrame.groupBy retains grouping columns</h4>
<p>Based on user feedback, we changed the default behavior of <code class="language-plaintext highlighter-rouge">DataFrame.groupBy().agg()</code> to retain the
grouping columns in the resulting <code class="language-plaintext highlighter-rouge">DataFrame</code>. To keep the behavior in 1.3, set <code class="language-plaintext highlighter-rouge">spark.sql.retainGroupColumns</code> to <code class="language-plaintext highlighter-rouge">false</code>.</p>
<div class="codetabs">
<div data-lang="scala">
<figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// In 1.3.x, in order for the grouping column "department" to show up,</span>
<span class="c1">// it must be included explicitly as part of the agg function call.</span>
<span class="nv">df</span><span class="o">.</span><span class="py">groupBy</span><span class="o">(</span><span class="s">"department"</span><span class="o">).</span><span class="py">agg</span><span class="o">(</span><span class="n">$</span><span class="s">"department"</span><span class="o">,</span> <span class="nf">max</span><span class="o">(</span><span class="s">"age"</span><span class="o">),</span> <span class="nf">sum</span><span class="o">(</span><span class="s">"expense"</span><span class="o">))</span>
<span class="c1">// In 1.4+, grouping column "department" is included automatically.</span>
<span class="nv">df</span><span class="o">.</span><span class="py">groupBy</span><span class="o">(</span><span class="s">"department"</span><span class="o">).</span><span class="py">agg</span><span class="o">(</span><span class="nf">max</span><span class="o">(</span><span class="s">"age"</span><span class="o">),</span> <span class="nf">sum</span><span class="o">(</span><span class="s">"expense"</span><span class="o">))</span>
<span class="c1">// Revert to 1.3 behavior (not retaining grouping column) by:</span>
<span class="nv">sqlContext</span><span class="o">.</span><span class="py">setConf</span><span class="o">(</span><span class="s">"spark.sql.retainGroupColumns"</span><span class="o">,</span> <span class="s">"false"</span><span class="o">)</span></code></pre></figure>
</div>
<div data-lang="java">
<figure class="highlight"><pre><code class="language-java" data-lang="java"><span class="c1">// In 1.3.x, in order for the grouping column "department" to show up,</span>
<span class="c1">// it must be included explicitly as part of the agg function call.</span>
<span class="n">df</span><span class="o">.</span><span class="na">groupBy</span><span class="o">(</span><span class="s">"department"</span><span class="o">).</span><span class="na">agg</span><span class="o">(</span><span class="n">col</span><span class="o">(</span><span class="s">"department"</span><span class="o">),</span> <span class="n">max</span><span class="o">(</span><span class="s">"age"</span><span class="o">),</span> <span class="n">sum</span><span class="o">(</span><span class="s">"expense"</span><span class="o">));</span>
<span class="c1">// In 1.4+, grouping column "department" is included automatically.</span>
<span class="n">df</span><span class="o">.</span><span class="na">groupBy</span><span class="o">(</span><span class="s">"department"</span><span class="o">).</span><span class="na">agg</span><span class="o">(</span><span class="n">max</span><span class="o">(</span><span class="s">"age"</span><span class="o">),</span> <span class="n">sum</span><span class="o">(</span><span class="s">"expense"</span><span class="o">));</span>
<span class="c1">// Revert to 1.3 behavior (not retaining grouping column) by:</span>
<span class="n">sqlContext</span><span class="o">.</span><span class="na">setConf</span><span class="o">(</span><span class="s">"spark.sql.retainGroupColumns"</span><span class="o">,</span> <span class="s">"false"</span><span class="o">);</span></code></pre></figure>
</div>
<div data-lang="python">
<figure class="highlight"><pre><code class="language-python" data-lang="python"><span class="kn">import</span> <span class="nn">pyspark.sql.functions</span> <span class="k">as</span> <span class="n">func</span>
<span class="c1"># In 1.3.x, in order for the grouping column "department" to show up,
# it must be included explicitly as part of the agg function call.
</span><span class="n">df</span><span class="p">.</span><span class="n">groupBy</span><span class="p">(</span><span class="s">"department"</span><span class="p">).</span><span class="n">agg</span><span class="p">(</span><span class="n">df</span><span class="p">[</span><span class="s">"department"</span><span class="p">],</span> <span class="n">func</span><span class="p">.</span><span class="nb">max</span><span class="p">(</span><span class="s">"age"</span><span class="p">),</span> <span class="n">func</span><span class="p">.</span><span class="nb">sum</span><span class="p">(</span><span class="s">"expense"</span><span class="p">))</span>
<span class="c1"># In 1.4+, grouping column "department" is included automatically.
</span><span class="n">df</span><span class="p">.</span><span class="n">groupBy</span><span class="p">(</span><span class="s">"department"</span><span class="p">).</span><span class="n">agg</span><span class="p">(</span><span class="n">func</span><span class="p">.</span><span class="nb">max</span><span class="p">(</span><span class="s">"age"</span><span class="p">),</span> <span class="n">func</span><span class="p">.</span><span class="nb">sum</span><span class="p">(</span><span class="s">"expense"</span><span class="p">))</span>
<span class="c1"># Revert to 1.3.x behavior (not retaining grouping column) by:
</span><span class="n">sqlContext</span><span class="p">.</span><span class="n">setConf</span><span class="p">(</span><span class="s">"spark.sql.retainGroupColumns"</span><span class="p">,</span> <span class="s">"false"</span><span class="p">)</span></code></pre></figure>
</div>
</div>
<h4 class="no_toc" id="behavior-change-on-dataframewithcolumn">Behavior change on DataFrame.withColumn</h4>
<p>Prior to 1.4, DataFrame.withColumn() supported only adding a new column. The column was always added
as a new column with its specified name in the result DataFrame, even if an existing column had the
same name. Since 1.4, DataFrame.withColumn() supports adding a column whose name differs from those of
all existing columns, or replacing an existing column of the same name.</p>
<p>Note that this change is only for Scala API, not for PySpark and SparkR.</p>
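<p>A minimal sketch, assuming a DataFrame <code class="language-plaintext highlighter-rouge">df</code> with an illustrative numeric column <code class="language-plaintext highlighter-rouge">age</code>:</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">import org.apache.spark.sql.functions.col

// Since 1.4, using an existing column name replaces that column in the result.
val replaced = df.withColumn("age", col("age") + 1)

// Using a new name still adds a new column, as before.
val added = df.withColumn("age_next_year", col("age") + 1)
</code></pre></figure>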
<h2 id="upgrading-from-spark-sql-10-12-to-13">Upgrading from Spark SQL 1.0-1.2 to 1.3</h2>
<p>In Spark 1.3 we removed the &#8220;Alpha&#8221; label from Spark SQL and as part of this did a cleanup of the
available APIs. From Spark 1.3 onwards, Spark SQL will provide binary compatibility with other
releases in the 1.X series. This compatibility guarantee excludes APIs that are explicitly marked
as unstable (i.e., DeveloperAPI or Experimental).</p>
<h4 class="no_toc" id="rename-of-schemardd-to-dataframe">Rename of SchemaRDD to DataFrame</h4>
<p>The largest change that users will notice when upgrading to Spark SQL 1.3 is that <code class="language-plaintext highlighter-rouge">SchemaRDD</code> has
been renamed to <code class="language-plaintext highlighter-rouge">DataFrame</code>. This is primarily because DataFrames no longer inherit from RDD
directly, but instead provide most of the functionality that RDDs provide through their own
implementation. DataFrames can still be converted to RDDs by calling the <code class="language-plaintext highlighter-rouge">.rdd</code> method.</p>
<p>In Scala, there is a type alias from <code class="language-plaintext highlighter-rouge">SchemaRDD</code> to <code class="language-plaintext highlighter-rouge">DataFrame</code> to provide source compatibility for
some use cases. It is still recommended that users update their code to use <code class="language-plaintext highlighter-rouge">DataFrame</code> instead.
Java and Python users will need to update their code.</p>
<h4 class="no_toc" id="unification-of-the-java-and-scala-apis">Unification of the Java and Scala APIs</h4>
<p>Prior to Spark 1.3 there were separate Java compatible classes (<code class="language-plaintext highlighter-rouge">JavaSQLContext</code> and <code class="language-plaintext highlighter-rouge">JavaSchemaRDD</code>)
that mirrored the Scala API. In Spark 1.3 the Java API and Scala API have been unified. Users
of either language should use <code class="language-plaintext highlighter-rouge">SQLContext</code> and <code class="language-plaintext highlighter-rouge">DataFrame</code>. In general these classes try to
use types that are usable from both languages (i.e. <code class="language-plaintext highlighter-rouge">Array</code> instead of language-specific collections).
In some cases where no common type exists (e.g., for passing in closures or Maps) function overloading
is used instead.</p>
<p>Additionally, the Java specific types API has been removed. Users of both Scala and Java should
use the classes present in <code class="language-plaintext highlighter-rouge">org.apache.spark.sql.types</code> to describe schema programmatically.</p>
<h4 class="no_toc" id="isolation-of-implicit-conversions-and-removal-of-dsl-package-scala-only">Isolation of Implicit Conversions and Removal of dsl Package (Scala-only)</h4>
<p>Many of the code examples prior to Spark 1.3 started with <code class="language-plaintext highlighter-rouge">import sqlContext._</code>, which brought
all of the functions from sqlContext into scope. In Spark 1.3 we have isolated the implicit
conversions for converting <code class="language-plaintext highlighter-rouge">RDD</code>s into <code class="language-plaintext highlighter-rouge">DataFrame</code>s into an object inside of the <code class="language-plaintext highlighter-rouge">SQLContext</code>.
Users should now write <code class="language-plaintext highlighter-rouge">import sqlContext.implicits._</code>.</p>
<p>Additionally, the implicit conversions now only augment RDDs that are composed of <code class="language-plaintext highlighter-rouge">Product</code>s (i.e.,
case classes or tuples) with a method <code class="language-plaintext highlighter-rouge">toDF</code>, instead of applying automatically.</p>
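<p>A minimal sketch (assuming an existing <code class="language-plaintext highlighter-rouge">SQLContext</code> named <code class="language-plaintext highlighter-rouge">sqlContext</code>, an existing <code class="language-plaintext highlighter-rouge">SparkContext</code> named <code class="language-plaintext highlighter-rouge">sc</code>, and an illustrative case class):</p>
<figure class="highlight"><pre><code class="language-scala" data-lang="scala">case class Person(name: String, age: Int)

// The implicit conversions now only add a toDF method to RDDs of Products
// (case classes or tuples) once sqlContext.implicits._ is imported.
import sqlContext.implicits._

val peopleDF = sc.parallelize(Seq(Person("Alice", 29), Person("Bob", 31))).toDF()
</code></pre></figure>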
<p>When using functions inside of the DSL (now replaced with the <code class="language-plaintext highlighter-rouge">DataFrame</code> API), users used to import
<code class="language-plaintext highlighter-rouge">org.apache.spark.sql.catalyst.dsl</code>. Instead, the public DataFrame functions API should be used:
<code class="language-plaintext highlighter-rouge">import org.apache.spark.sql.functions._</code>.</p>
<h4 class="no_toc" id="removal-of-the-type-aliases-in-orgapachesparksql-for-datatype-scala-only">Removal of the type aliases in org.apache.spark.sql for DataType (Scala-only)</h4>
<p>Spark 1.3 removes the type aliases that were present in the base sql package for <code class="language-plaintext highlighter-rouge">DataType</code>. Users
should instead import the classes in <code class="language-plaintext highlighter-rouge">org.apache.spark.sql.types</code>.</p>
<h4 class="no_toc" id="udf-registration-moved-to-sqlcontextudf-java--scala">UDF Registration Moved to <code class="language-plaintext highlighter-rouge">sqlContext.udf</code> (Java &amp; Scala)</h4>
<p>Functions that are used to register UDFs, either for use in the DataFrame DSL or SQL, have been
moved into the udf object in <code class="language-plaintext highlighter-rouge">SQLContext</code>.</p>
<div class="codetabs">
<div data-lang="scala">
<figure class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="nv">sqlContext</span><span class="o">.</span><span class="py">udf</span><span class="o">.</span><span class="py">register</span><span class="o">(</span><span class="s">"strLen"</span><span class="o">,</span> <span class="o">(</span><span class="n">s</span><span class="k">:</span> <span class="kt">String</span><span class="o">)</span> <span class="k">=&gt;</span> <span class="nv">s</span><span class="o">.</span><span class="py">length</span><span class="o">())</span></code></pre></figure>
</div>
<div data-lang="java">
<figure class="highlight"><pre><code class="language-java" data-lang="java"><span class="n">sqlContext</span><span class="o">.</span><span class="na">udf</span><span class="o">().</span><span class="na">register</span><span class="o">(</span><span class="s">"strLen"</span><span class="o">,</span> <span class="o">(</span><span class="nc">String</span> <span class="n">s</span><span class="o">)</span> <span class="o">-&gt;</span> <span class="n">s</span><span class="o">.</span><span class="na">length</span><span class="o">(),</span> <span class="nc">DataTypes</span><span class="o">.</span><span class="na">IntegerType</span><span class="o">);</span></code></pre></figure>
</div>
</div>
<p>Python UDF registration is unchanged.</p>
<h2 id="compatibility-with-apache-hive">Compatibility with Apache Hive</h2>
<p>Spark SQL is designed to be compatible with the Hive Metastore, SerDes and UDFs.
Currently, Hive SerDes and UDFs are based on built-in Hive,
and Spark SQL can be connected to different versions of Hive Metastore
(from 0.12.0 to 2.3.9 and 3.0.0 to 3.1.2; see also <a href="sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore">Interacting with Different Versions of Hive Metastore</a>).</p>
<h4 class="no_toc" id="deploying-in-existing-hive-warehouses">Deploying in Existing Hive Warehouses</h4>
<p>The Spark SQL Thrift JDBC server is designed to be &#8220;out of the box&#8221; compatible with existing Hive
installations. You do not need to modify your existing Hive Metastore or change the data placement
or partitioning of your tables.</p>
<h3 class="no_toc" id="supported-hive-features">Supported Hive Features</h3>
<p>Spark SQL supports the vast majority of Hive features, such as:</p>
<ul>
<li>Hive query statements, including:
<ul>
<li><code class="language-plaintext highlighter-rouge">SELECT</code></li>
<li><code class="language-plaintext highlighter-rouge">GROUP BY</code></li>
<li><code class="language-plaintext highlighter-rouge">ORDER BY</code></li>
<li><code class="language-plaintext highlighter-rouge">DISTRIBUTE BY</code></li>
<li><code class="language-plaintext highlighter-rouge">CLUSTER BY</code></li>
<li><code class="language-plaintext highlighter-rouge">SORT BY</code></li>
</ul>
</li>
<li>All Hive operators, including:
<ul>
<li>Relational operators (<code class="language-plaintext highlighter-rouge">=</code>, <code class="language-plaintext highlighter-rouge">&lt;=&gt;</code>, <code class="language-plaintext highlighter-rouge">==</code>, <code class="language-plaintext highlighter-rouge">&lt;&gt;</code>, <code class="language-plaintext highlighter-rouge">&lt;</code>, <code class="language-plaintext highlighter-rouge">&gt;</code>, <code class="language-plaintext highlighter-rouge">&gt;=</code>, <code class="language-plaintext highlighter-rouge">&lt;=</code>, etc)</li>
<li>Arithmetic operators (<code class="language-plaintext highlighter-rouge">+</code>, <code class="language-plaintext highlighter-rouge">-</code>, <code class="language-plaintext highlighter-rouge">*</code>, <code class="language-plaintext highlighter-rouge">/</code>, <code class="language-plaintext highlighter-rouge">%</code>, etc)</li>
<li>Logical operators (<code class="language-plaintext highlighter-rouge">AND</code>, <code class="language-plaintext highlighter-rouge">OR</code>, etc)</li>
<li>Complex type constructors</li>
<li>Mathematical functions (<code class="language-plaintext highlighter-rouge">sign</code>, <code class="language-plaintext highlighter-rouge">ln</code>, <code class="language-plaintext highlighter-rouge">cos</code>, etc)</li>
<li>String functions (<code class="language-plaintext highlighter-rouge">instr</code>, <code class="language-plaintext highlighter-rouge">length</code>, <code class="language-plaintext highlighter-rouge">printf</code>, etc)</li>
</ul>
</li>
<li>User defined functions (UDF)</li>
<li>User defined aggregation functions (UDAF)</li>
<li>User defined serialization formats (SerDes)</li>
<li>Window functions</li>
<li>Joins
<ul>
<li><code class="language-plaintext highlighter-rouge">JOIN</code></li>
<li><code class="language-plaintext highlighter-rouge">{LEFT|RIGHT|FULL} OUTER JOIN</code></li>
<li><code class="language-plaintext highlighter-rouge">LEFT SEMI JOIN</code></li>
<li><code class="language-plaintext highlighter-rouge">LEFT ANTI JOIN</code></li>
<li><code class="language-plaintext highlighter-rouge">CROSS JOIN</code></li>
</ul>
</li>
<li>Unions</li>
<li>Sub-queries
<ul>
<li>
<p>Sub-queries in the FROM Clause</p>
<p><code class="language-plaintext highlighter-rouge">SELECT col FROM (SELECT a + b AS col FROM t1) t2</code></p>
</li>
<li>
<p>Sub-queries in WHERE Clause</p>
<ul>
<li>
<p>Correlated or non-correlated IN and NOT IN statement in WHERE Clause</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>SELECT col FROM t1 WHERE col IN (SELECT a FROM t2 WHERE t1.a = t2.a)
SELECT col FROM t1 WHERE col IN (SELECT a FROM t2)
</code></pre></div> </div>
</li>
<li>
<p>Correlated or non-correlated EXISTS and NOT EXISTS statement in WHERE Clause</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>SELECT col FROM t1 WHERE EXISTS (SELECT t2.a FROM t2 WHERE t1.a = t2.a AND t2.a &gt; 10)
SELECT col FROM t1 WHERE EXISTS (SELECT t2.a FROM t2 WHERE t2.a &gt; 10)
</code></pre></div> </div>
</li>
<li>
<p>Non-correlated IN and NOT IN statement in JOIN Condition</p>
<p><code class="language-plaintext highlighter-rouge">SELECT t1.col FROM t1 JOIN t2 ON t1.a = t2.a AND t1.a IN (SELECT a FROM t3)</code></p>
</li>
<li>
<p>Non-correlated EXISTS and NOT EXISTS statement in JOIN Condition</p>
<p><code class="language-plaintext highlighter-rouge">SELECT t1.col FROM t1 JOIN t2 ON t1.a = t2.a AND EXISTS (SELECT * FROM t3 WHERE t3.a &gt; 10)</code></p>
</li>
</ul>
</li>
</ul>
</li>
<li>Sampling</li>
<li>Explain</li>
<li>Partitioned tables including dynamic partition insertion</li>
<li>View
<ul>
<li>
<p>If column aliases are not specified in view definition queries, both Spark and Hive will
generate alias names, but in different ways. In order for Spark to be able to read views created
by Hive, users should explicitly specify column aliases in view definition queries. As an
example, Spark cannot read <code class="language-plaintext highlighter-rouge">v1</code> created as below by Hive.</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>CREATE VIEW v1 AS SELECT * FROM (SELECT c + 1 FROM (SELECT 1 c) t1) t2;
</code></pre></div> </div>
<p>Instead, you should create <code class="language-plaintext highlighter-rouge">v1</code> as below with column aliases explicitly specified.</p>
<div class="language-plaintext highlighter-rouge"><div class="highlight"><pre class="highlight"><code>CREATE VIEW v1 AS SELECT * FROM (SELECT c + 1 AS inc_c FROM (SELECT 1 c) t1) t2;
</code></pre></div> </div>
</li>
</ul>
</li>
<li>All Hive DDL Functions, including:
<ul>
<li><code class="language-plaintext highlighter-rouge">CREATE TABLE</code></li>
<li><code class="language-plaintext highlighter-rouge">CREATE TABLE AS SELECT</code></li>
<li><code class="language-plaintext highlighter-rouge">CREATE TABLE LIKE</code></li>
<li><code class="language-plaintext highlighter-rouge">ALTER TABLE</code></li>
</ul>
</li>
<li>Most Hive Data types, including:
<ul>
<li><code class="language-plaintext highlighter-rouge">TINYINT</code></li>
<li><code class="language-plaintext highlighter-rouge">SMALLINT</code></li>
<li><code class="language-plaintext highlighter-rouge">INT</code></li>
<li><code class="language-plaintext highlighter-rouge">BIGINT</code></li>
<li><code class="language-plaintext highlighter-rouge">BOOLEAN</code></li>
<li><code class="language-plaintext highlighter-rouge">FLOAT</code></li>
<li><code class="language-plaintext highlighter-rouge">DOUBLE</code></li>
<li><code class="language-plaintext highlighter-rouge">STRING</code></li>
<li><code class="language-plaintext highlighter-rouge">BINARY</code></li>
<li><code class="language-plaintext highlighter-rouge">TIMESTAMP</code></li>
<li><code class="language-plaintext highlighter-rouge">DATE</code></li>
<li><code class="language-plaintext highlighter-rouge">ARRAY&lt;&gt;</code></li>
<li><code class="language-plaintext highlighter-rouge">MAP&lt;&gt;</code></li>
<li><code class="language-plaintext highlighter-rouge">STRUCT&lt;&gt;</code></li>
</ul>
</li>
</ul>
<h3 class="no_toc" id="unsupported-hive-functionality">Unsupported Hive Functionality</h3>
<p>Below is a list of Hive features that we don&#8217;t support yet. Most of these features are rarely used
in Hive deployments.</p>
<p><strong>Major Hive Features</strong></p>
<ul>
<li>Tables with buckets: bucket is the hash partitioning within a Hive table partition. Spark SQL
doesn&#8217;t support buckets yet.</li>
</ul>
<p><strong>Esoteric Hive Features</strong></p>
<ul>
<li><code class="language-plaintext highlighter-rouge">UNION</code> type</li>
<li>Unique join</li>
<li>Column statistics collecting: Spark SQL does not piggyback scans to collect column statistics at
the moment and only supports populating the sizeInBytes field of the hive metastore.</li>
</ul>
<p><strong>Hive Input/Output Formats</strong></p>
<ul>
<li>File format for CLI: for results shown back to the CLI, Spark SQL only supports TextOutputFormat.</li>
<li>Hadoop archive</li>
</ul>
<p><strong>Hive Optimizations</strong></p>
<p>A handful of Hive optimizations are not yet included in Spark. Some of these (such as indexes) are
less important due to Spark SQL&#8217;s in-memory computational model. Others are slotted for future
releases of Spark SQL.</p>
<ul>
<li>Block-level bitmap indexes and virtual columns (used to build indexes)</li>
<li>Automatically determine the number of reducers for joins and groupbys: Currently, in Spark SQL, you
need to control the degree of parallelism post-shuffle using &#8220;<code class="language-plaintext highlighter-rouge">SET spark.sql.shuffle.partitions=[num_tasks];</code>&#8221;.</li>
<li>Meta-data only query: For queries that can be answered by using only metadata, Spark SQL still
launches tasks to compute the result.</li>
<li>Skew data flag: Spark SQL does not follow the skew data flags in Hive.</li>
<li><code class="language-plaintext highlighter-rouge">STREAMTABLE</code> hint in join: Spark SQL does not follow the <code class="language-plaintext highlighter-rouge">STREAMTABLE</code> hint.</li>
<li>Merge multiple small files for query results: if the result output contains multiple small files,
Hive can optionally merge the small files into fewer large files to avoid overflowing the HDFS
metadata. Spark SQL does not support that.</li>
</ul>
<p><strong>Hive UDF/UDTF/UDAF</strong></p>
<p>Not all the APIs of the Hive UDF/UDTF/UDAF are supported by Spark SQL. Below are the unsupported APIs:</p>
<ul>
<li><code class="language-plaintext highlighter-rouge">getRequiredJars</code> and <code class="language-plaintext highlighter-rouge">getRequiredFiles</code> (<code class="language-plaintext highlighter-rouge">UDF</code> and <code class="language-plaintext highlighter-rouge">GenericUDF</code>) are functions to automatically
include additional resources required by this UDF.</li>
<li><code class="language-plaintext highlighter-rouge">initialize(StructObjectInspector)</code> in <code class="language-plaintext highlighter-rouge">GenericUDTF</code> is not supported yet. Spark SQL currently uses
a deprecated interface <code class="language-plaintext highlighter-rouge">initialize(ObjectInspector[])</code> only.</li>
<li><code class="language-plaintext highlighter-rouge">configure</code> (<code class="language-plaintext highlighter-rouge">GenericUDF</code>, <code class="language-plaintext highlighter-rouge">GenericUDTF</code>, and <code class="language-plaintext highlighter-rouge">GenericUDAFEvaluator</code>) is a function to initialize
functions with <code class="language-plaintext highlighter-rouge">MapredContext</code>, which is inapplicable to Spark.</li>
<li><code class="language-plaintext highlighter-rouge">close</code> (<code class="language-plaintext highlighter-rouge">GenericUDF</code> and <code class="language-plaintext highlighter-rouge">GenericUDAFEvaluator</code>) is a function to release associated resources.
Spark SQL does not call this function when tasks finish.</li>
<li><code class="language-plaintext highlighter-rouge">reset</code> (<code class="language-plaintext highlighter-rouge">GenericUDAFEvaluator</code>) is a function to re-initialize aggregation for reusing the same aggregation.
Spark SQL currently does not support the reuse of aggregation.</li>
<li><code class="language-plaintext highlighter-rouge">getWindowingEvaluator</code> (<code class="language-plaintext highlighter-rouge">GenericUDAFEvaluator</code>) is a function to optimize aggregation by evaluating
an aggregate over a fixed window.</li>
</ul>
<h3 class="no_toc" id="incompatible-hive-udf">Incompatible Hive UDF</h3>
<p>Below are the scenarios in which Hive and Spark generate different results:</p>
<ul>
<li><code class="language-plaintext highlighter-rouge">SQRT(n)</code> If n &lt; 0, Hive returns null while Spark SQL returns NaN.</li>
<li><code class="language-plaintext highlighter-rouge">ACOS(n)</code> If n &lt; -1 or n &gt; 1, Hive returns null while Spark SQL returns NaN.</li>
<li><code class="language-plaintext highlighter-rouge">ASIN(n)</code> If n &lt; -1 or n &gt; 1, Hive returns null while Spark SQL returns NaN.</li>
<li><code class="language-plaintext highlighter-rouge">CAST(n AS TIMESTAMP)</code> If n is an integral number, Hive treats n as milliseconds while Spark SQL treats n as seconds.</li>
</ul>
</div>
<!-- /container -->
</div>
<script src="js/vendor/jquery-3.5.1.min.js"></script>
<script src="js/vendor/bootstrap.bundle.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.js"></script>
<script type="text/javascript">
// DocSearch is entirely free and automated. DocSearch is built in two parts:
// 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link
// in your website and extract content from every page it traverses. It then pushes this
// content to an Algolia index.
// 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index
// to your search input and display its results in a dropdown UI. If you want to find more
// details on how works DocSearch, check the docs of DocSearch.
docsearch({
apiKey: 'd62f962a82bc9abb53471cb7b89da35e',
appId: 'RAI69RXRSK',
indexName: 'apache_spark',
inputSelector: '#docsearch-input',
enhancedSearchInput: true,
algoliaOptions: {
'facetFilters': ["version:3.2.4"]
},
debug: false // Set debug to true if you want to inspect the dropdown
});
</script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
// We could use "//cdn.mathjax...", but that won't support "file://".
(function(d, script) {
script = d.createElement('script');
script.type = 'text/javascript';
script.async = true;
script.onload = function(){
MathJax.Hub.Config({
tex2jax: {
inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
processEscapes: true,
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
});
};
script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
'?config=TeX-AMS-MML_HTMLorMML';
d.getElementsByTagName('head')[0].appendChild(script);
}(document));
</script>
</body>
</html>