blob: fe012919b18ce23df8a140cd72064d1bc6f81c75 [file] [log] [blame]
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>NULL Semantics - Spark 3.5.0 Documentation</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&Courier+Prime:wght@400;700&display=swap" rel="stylesheet">
<link href="css/custom.css" rel="stylesheet">
<script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
<link rel="stylesheet" href="css/pygments-default.css">
<link rel="stylesheet" href="css/docsearch.min.css" />
<link rel="stylesheet" href="css/docsearch.css">
<!-- Matomo -->
<script type="text/javascript">
var _paq = window._paq = window._paq || [];
/* tracker methods like "setCustomDimension" should be called before "trackPageView" */
_paq.push(["disableCookies"]);
_paq.push(['trackPageView']);
_paq.push(['enableLinkTracking']);
(function() {
var u="https://analytics.apache.org/";
_paq.push(['setTrackerUrl', u+'matomo.php']);
_paq.push(['setSiteId', '40']);
var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s);
})();
</script>
<!-- End Matomo Code -->
</head>
<body class="global">
<!--[if lt IE 7]>
<p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
<![endif]-->
<!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
<nav class="navbar navbar-expand-lg navbar-dark p-0 px-4 fixed-top" style="background: #1d6890;" id="topbar">
<div class="navbar-brand"><a href="index.html">
<img src="img/spark-logo-rev.svg" width="141" height="72"/></a><span class="version">3.5.0</span>
</div>
<button class="navbar-toggler" type="button" data-toggle="collapse"
data-target="#navbarCollapse" aria-controls="navbarCollapse"
aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarCollapse">
<ul class="navbar-nav me-auto">
<li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a>
<div class="dropdown-menu" aria-labelledby="navbarQuickStart">
<a class="dropdown-item" href="quick-start.html">Quick Start</a>
<a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a>
<a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a>
<a class="dropdown-item" href="structured-streaming-programming-guide.html">Structured Streaming</a>
<a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a>
<a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a>
<a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a>
<a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a>
<a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a>
<div class="dropdown-menu" aria-labelledby="navbarAPIDocs">
<a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a>
<a class="dropdown-item" href="api/java/index.html">Java</a>
<a class="dropdown-item" href="api/python/index.html">Python</a>
<a class="dropdown-item" href="api/R/index.html">R</a>
<a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a>
<div class="dropdown-menu" aria-labelledby="navbarDeploying">
<a class="dropdown-item" href="cluster-overview.html">Overview</a>
<a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a>
<a class="dropdown-item" href="running-on-mesos.html">Mesos</a>
<a class="dropdown-item" href="running-on-yarn.html">YARN</a>
<a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
<div class="dropdown-menu" aria-labelledby="navbarMore">
<a class="dropdown-item" href="configuration.html">Configuration</a>
<a class="dropdown-item" href="monitoring.html">Monitoring</a>
<a class="dropdown-item" href="tuning.html">Tuning Guide</a>
<a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a>
<a class="dropdown-item" href="security.html">Security</a>
<a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a>
<a class="dropdown-item" href="migration-guide.html">Migration Guide</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="building-spark.html">Building Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a>
</div>
</li>
<li class="nav-item">
<input type="text" id="docsearch-input" placeholder="Search the docs…">
</li>
</ul>
<!--<span class="navbar-text navbar-right"><span class="version-text">v3.5.0</span></span>-->
</div>
</nav>
<div class="container">
<div class="left-menu-wrapper">
<div class="left-menu">
<h3><a href="sql-programming-guide.html">Spark SQL Guide</a></h3>
<ul>
<li>
<a href="sql-getting-started.html">
Getting Started
</a>
</li>
<li>
<a href="sql-data-sources.html">
Data Sources
</a>
</li>
<li>
<a href="sql-performance-tuning.html">
Performance Tuning
</a>
</li>
<li>
<a href="sql-distributed-sql-engine.html">
Distributed SQL Engine
</a>
</li>
<li>
<a href="sql-pyspark-pandas-with-arrow.html">
PySpark Usage Guide for Pandas with Apache Arrow
</a>
</li>
<li>
<a href="sql-migration-guide.html">
Migration Guide
</a>
</li>
<li>
<a href="sql-ref.html">
SQL Reference
</a>
</li>
<ul>
<li>
<a href="sql-ref-ansi-compliance.html">
ANSI Compliance
</a>
</li>
<li>
<a href="sql-ref-datatypes.html">
Data Types
</a>
</li>
<li>
<a href="sql-ref-datetime-pattern.html">
Datetime Pattern
</a>
</li>
<li>
<a href="sql-ref-number-pattern.html">
Number Pattern
</a>
</li>
<li>
<a href="sql-ref-functions.html">
Functions
</a>
</li>
<li>
<a href="sql-ref-identifier.html">
Identifiers
</a>
</li>
<li>
<a href="sql-ref-literals.html">
Literals
</a>
</li>
<li>
<a href="sql-ref-null-semantics.html">
Null Semantics
</a>
</li>
<li>
<a href="sql-ref-syntax.html">
SQL Syntax
</a>
</li>
</ul>
<li>
<a href="sql-error-conditions.html">
Error Conditions
</a>
</li>
</ul>
</div>
</div>
<input id="nav-trigger" class="nav-trigger" checked type="checkbox">
<label for="nav-trigger"></label>
<div class="content-with-sidebar mr-3" id="content">
<h1 class="title">NULL Semantics</h1>
<h3 id="description">Description</h3>
<p>A table consists of a set of rows and each row contains a set of columns.
A column is associated with a data type and represents
a specific attribute of an entity (for example, <code class="language-plaintext highlighter-rouge">age</code> is a column of an
entity called <code class="language-plaintext highlighter-rouge">person</code>). Sometimes, the value of a column
specific to a row is not known at the time the row comes into existence.
In <code class="language-plaintext highlighter-rouge">SQL</code>, such values are represented as <code class="language-plaintext highlighter-rouge">NULL</code>. This section details the
semantics of <code class="language-plaintext highlighter-rouge">NULL</code> values handling in various operators, expressions and
other <code class="language-plaintext highlighter-rouge">SQL</code> constructs.</p>
<ol>
<li><a href="#comp-operators">Null handling in comparison operators</a></li>
<li><a href="#logical-operators">Null handling in Logical operators</a></li>
<li><a href="#expressions">Null handling in Expressions</a>
<ol>
<li><a href="#null-intolerant">Null handling in null-intolerant expressions</a></li>
<li><a href="#can-process-null">Null handling Expressions that can process null value operands</a></li>
<li><a href="#built-in-aggregate">Null handling in built-in aggregate expressions</a></li>
</ol>
</li>
<li><a href="#condition-expressions">Null handling in WHERE, HAVING and JOIN conditions</a></li>
<li><a href="#aggregate-operator">Null handling in GROUP BY and DISTINCT</a></li>
<li><a href="#order-by">Null handling in ORDER BY</a></li>
<li><a href="#set-operators">Null handling in UNION, INTERSECT, EXCEPT</a></li>
<li><a href="#exists-not-exists">Null handling in EXISTS and NOT EXISTS subquery</a></li>
<li><a href="#in-not-in">Null handling in IN and NOT IN subquery</a></li>
</ol>
<style type="text/css">
.tsclass {font-size:12px;color:#333333;width:40%;border-width: 2px;border-color: #729ea5;border-collapse: collapse;}
.tsclass th {text-align: left;}
</style>
<p>The following illustrates the schema layout and data of a table named <code class="language-plaintext highlighter-rouge">person</code>. The data contains <code class="language-plaintext highlighter-rouge">NULL</code> values in
the <code class="language-plaintext highlighter-rouge">age</code> column and this table will be used in various examples in the sections below.<br />
<strong><u>TABLE: person</u></strong></p>
<table>
<thead>
<tr>
<th>Id</th>
<th>Name</th>
<th>Age</th>
</tr>
</thead>
<tbody>
<tr>
<td>100</td>
<td>Joe</td>
<td>30</td>
</tr>
<tr>
<td>200</td>
<td>Marry</td>
<td>NULL</td>
</tr>
<tr>
<td>300</td>
<td>Mike</td>
<td>18</td>
</tr>
<tr>
<td>400</td>
<td>Fred</td>
<td>50</td>
</tr>
<tr>
<td>500</td>
<td>Albert</td>
<td>NULL</td>
</tr>
<tr>
<td>600</td>
<td>Michelle</td>
<td>30</td>
</tr>
<tr>
<td>700</td>
<td>Dan</td>
<td>50</td>
</tr>
</tbody>
</table>
<h3 id="comparison-operators-">Comparison Operators <a name="comp-operators"></a></h3>
<p>Apache spark supports the standard comparison operators such as &#8216;&gt;&#8217;, &#8216;&gt;=&#8217;, &#8216;=&#8217;, &#8216;&lt;&#8217; and &#8216;&lt;=&#8217;.
The result of these operators is unknown or <code class="language-plaintext highlighter-rouge">NULL</code> when one of the operands or both the operands are
unknown or <code class="language-plaintext highlighter-rouge">NULL</code>. In order to compare the <code class="language-plaintext highlighter-rouge">NULL</code> values for equality, Spark provides a null-safe
equal operator (&#8216;&lt;=&gt;&#8217;), which returns <code class="language-plaintext highlighter-rouge">False</code> when one of the operand is <code class="language-plaintext highlighter-rouge">NULL</code> and returns &#8216;True<code class="language-plaintext highlighter-rouge"> when
both the operands are </code>NULL<code class="language-plaintext highlighter-rouge">. The following table illustrates the behaviour of comparison operators when
one or both operands are </code>NULL`:</p>
<table>
<thead>
<tr>
<th>Left Operand</th>
<th>Right Operand</th>
<th>&gt;</th>
<th>&gt;=</th>
<th>=</th>
<th>&lt;</th>
<th>&lt;=</th>
<th>&lt;=&gt;</th>
</tr>
</thead>
<tbody>
<tr>
<td>NULL</td>
<td>Any value</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>False</td>
</tr>
<tr>
<td>Any value</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>False</td>
</tr>
<tr>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>True</td>
</tr>
</tbody>
</table>
<h3 id="examples">Examples</h3>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="c1">-- Normal comparison operators return `NULL` when one of the operand is `NULL`.</span>
<span class="k">SELECT</span> <span class="mi">5</span> <span class="o">&gt;</span> <span class="k">null</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span> <span class="k">null</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="c1">-- Normal comparison operators return `NULL` when both the operands are `NULL`.</span>
<span class="k">SELECT</span> <span class="k">null</span> <span class="o">=</span> <span class="k">null</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span> <span class="k">null</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="c1">-- Null-safe equal operator return `False` when one of the operand is `NULL`</span>
<span class="k">SELECT</span> <span class="mi">5</span> <span class="o">&lt;=&gt;</span> <span class="k">null</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span> <span class="k">false</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="c1">-- Null-safe equal operator return `True` when one of the operand is `NULL`</span>
<span class="k">SELECT</span> <span class="k">NULL</span> <span class="o">&lt;=&gt;</span> <span class="k">NULL</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span> <span class="k">true</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
</code></pre></div></div>
<h3 id="logical-operators-">Logical Operators <a name="logical-operators"></a></h3>
<p>Spark supports standard logical operators such as <code class="language-plaintext highlighter-rouge">AND</code>, <code class="language-plaintext highlighter-rouge">OR</code> and <code class="language-plaintext highlighter-rouge">NOT</code>. These operators take <code class="language-plaintext highlighter-rouge">Boolean</code> expressions
as the arguments and return a <code class="language-plaintext highlighter-rouge">Boolean</code> value.</p>
<p>The following tables illustrate the behavior of logical operators when one or both operands are <code class="language-plaintext highlighter-rouge">NULL</code>.</p>
<table>
<thead>
<tr>
<th>Left Operand</th>
<th>Right Operand</th>
<th>OR</th>
<th>AND</th>
</tr>
</thead>
<tbody>
<tr>
<td>True</td>
<td>NULL</td>
<td>True</td>
<td>NULL</td>
</tr>
<tr>
<td>False</td>
<td>NULL</td>
<td>NULL</td>
<td>False</td>
</tr>
<tr>
<td>NULL</td>
<td>True</td>
<td>True</td>
<td>NULL</td>
</tr>
<tr>
<td>NULL</td>
<td>False</td>
<td>NULL</td>
<td>False</td>
</tr>
<tr>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
</tr>
</tbody>
</table>
<table>
<thead>
<tr>
<th>operand</th>
<th>NOT</th>
</tr>
</thead>
<tbody>
<tr>
<td>NULL</td>
<td>NULL</td>
</tr>
</tbody>
</table>
<h3 id="examples-1">Examples</h3>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="c1">-- Normal comparison operators return `NULL` when one of the operands is `NULL`.</span>
<span class="k">SELECT</span> <span class="p">(</span><span class="k">true</span> <span class="k">OR</span> <span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span> <span class="k">true</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="c1">-- Normal comparison operators return `NULL` when both the operands are `NULL`.</span>
<span class="k">SELECT</span> <span class="p">(</span><span class="k">null</span> <span class="k">OR</span> <span class="k">false</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span> <span class="k">null</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="c1">-- Null-safe equal operator returns `False` when one of the operands is `NULL`</span>
<span class="k">SELECT</span> <span class="k">NOT</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span> <span class="k">null</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
</code></pre></div></div>
<h3 id="expressions-">Expressions <a name="expressions"></a></h3>
<p>The comparison operators and logical operators are treated as expressions in
Spark. Other than these two kinds of expressions, Spark supports other form of
expressions such as function expressions, cast expressions, etc. The expressions
in Spark can be broadly classified as :</p>
<ul>
<li>Null intolerant expressions</li>
<li>Expressions that can process <code class="language-plaintext highlighter-rouge">NULL</code> value operands
<ul>
<li>The result of these expressions depends on the expression itself.</li>
</ul>
</li>
</ul>
<h4 id="null-intolerant-expressions-">Null Intolerant Expressions <a name="null-intolerant"></a></h4>
<p>Null intolerant expressions return <code class="language-plaintext highlighter-rouge">NULL</code> when one or more arguments of
expression are <code class="language-plaintext highlighter-rouge">NULL</code> and most of the expressions fall in this category.</p>
<h5 id="examples-2">Examples</h5>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">SELECT</span> <span class="n">concat</span><span class="p">(</span><span class="s1">'John'</span><span class="p">,</span> <span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span> <span class="k">null</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="k">SELECT</span> <span class="n">positive</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span> <span class="k">null</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="k">SELECT</span> <span class="n">to_date</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span> <span class="k">null</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
</code></pre></div></div>
<h4 id="expressions-that-can-process-null-value-operands-">Expressions That Can Process Null Value Operands <a name="can-process-null"></a></h4>
<p>This class of expressions are designed to handle <code class="language-plaintext highlighter-rouge">NULL</code> values. The result of the
expressions depends on the expression itself. As an example, function expression <code class="language-plaintext highlighter-rouge">isnull</code>
returns a <code class="language-plaintext highlighter-rouge">true</code> on null input and <code class="language-plaintext highlighter-rouge">false</code> on non null input where as function <code class="language-plaintext highlighter-rouge">coalesce</code>
returns the first non <code class="language-plaintext highlighter-rouge">NULL</code> value in its list of operands. However, <code class="language-plaintext highlighter-rouge">coalesce</code> returns
<code class="language-plaintext highlighter-rouge">NULL</code> when all its operands are <code class="language-plaintext highlighter-rouge">NULL</code>. Below is an incomplete list of expressions of this category.</p>
<ul>
<li>COALESCE</li>
<li>NULLIF</li>
<li>IFNULL</li>
<li>NVL</li>
<li>NVL2</li>
<li>ISNAN</li>
<li>NANVL</li>
<li>ISNULL</li>
<li>ISNOTNULL</li>
<li>ATLEASTNNONNULLS</li>
<li>IN</li>
</ul>
<h5 id="examples-3">Examples</h5>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">SELECT</span> <span class="k">isnull</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span> <span class="k">true</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="c1">-- Returns the first occurrence of non `NULL` value.</span>
<span class="k">SELECT</span> <span class="n">coalesce</span><span class="p">(</span><span class="k">null</span><span class="p">,</span> <span class="k">null</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span> <span class="mi">3</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="c1">-- Returns `NULL` as all its operands are `NULL`. </span>
<span class="k">SELECT</span> <span class="n">coalesce</span><span class="p">(</span><span class="k">null</span><span class="p">,</span> <span class="k">null</span><span class="p">,</span> <span class="k">null</span><span class="p">,</span> <span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span> <span class="k">null</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="k">SELECT</span> <span class="n">isnan</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span> <span class="k">false</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
</code></pre></div></div>
<h4 id="builtin-aggregate-expressions-">Builtin Aggregate Expressions <a name="built-in-aggregate"></a></h4>
<p>Aggregate functions compute a single result by processing a set of input rows. Below are
the rules of how <code class="language-plaintext highlighter-rouge">NULL</code> values are handled by aggregate functions.</p>
<ul>
<li><code class="language-plaintext highlighter-rouge">NULL</code> values are ignored from processing by all the aggregate functions.
<ul>
<li>Only exception to this rule is COUNT(*) function.</li>
</ul>
</li>
<li>Some aggregate functions return <code class="language-plaintext highlighter-rouge">NULL</code> when all input values are <code class="language-plaintext highlighter-rouge">NULL</code> or the input data set
is empty.<br /> The list of these functions is:
<ul>
<li>MAX</li>
<li>MIN</li>
<li>SUM</li>
<li>AVG</li>
<li>EVERY</li>
<li>ANY</li>
<li>SOME</li>
</ul>
</li>
</ul>
<h4 id="examples-4">Examples</h4>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="c1">-- `count(*)` does not skip `NULL` values.</span>
<span class="k">SELECT</span> <span class="k">count</span><span class="p">(</span><span class="o">*</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span> <span class="mi">7</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="c1">-- `NULL` values in column `age` are skipped from processing.</span>
<span class="k">SELECT</span> <span class="k">count</span><span class="p">(</span><span class="n">age</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span><span class="p">;</span>
<span class="o">+</span><span class="c1">----------+</span>
<span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="n">age</span><span class="p">)</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----------+</span>
<span class="o">|</span> <span class="mi">5</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----------+</span>
<span class="c1">-- `count(*)` on an empty input set returns 0. This is unlike the other</span>
<span class="c1">-- aggregate functions, such as `max`, which return `NULL`.</span>
<span class="k">SELECT</span> <span class="k">count</span><span class="p">(</span><span class="o">*</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">where</span> <span class="mi">1</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span> <span class="mi">0</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="c1">-- `NULL` values are excluded from computation of maximum value.</span>
<span class="k">SELECT</span> <span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span><span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="c1">-- `max` returns `NULL` on an empty input set.</span>
<span class="k">SELECT</span> <span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">where</span> <span class="mi">1</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span><span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span> <span class="k">null</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
</code></pre></div></div>
<h3 id="condition-expressions-in-where-having-and-join-clauses-">Condition Expressions in WHERE, HAVING and JOIN Clauses <a name="condition-expressions"></a></h3>
<p><code class="language-plaintext highlighter-rouge">WHERE</code>, <code class="language-plaintext highlighter-rouge">HAVING</code> operators filter rows based on the user specified condition.
A <code class="language-plaintext highlighter-rouge">JOIN</code> operator is used to combine rows from two tables based on a join condition.
For all the three operators, a condition expression is a boolean expression and can return
<code>True, False or Unknown (NULL)</code>. They are &#8220;satisfied&#8221; if the result of the condition is <code class="language-plaintext highlighter-rouge">True</code>.</p>
<h4 id="examples-5">Examples</h4>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="c1">-- Persons whose age is unknown (`NULL`) are filtered out from the result set.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="n">age</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+---+</span>
<span class="o">|</span> <span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+---+</span>
<span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Fred</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Mike</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Dan</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Joe</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+---+</span>
<span class="c1">-- `IS NULL` expression is used in disjunction to select the persons</span>
<span class="c1">-- with unknown (`NULL`) records.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="n">age</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="k">OR</span> <span class="n">age</span> <span class="k">IS</span> <span class="k">NULL</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="o">|</span> <span class="n">name</span><span class="o">|</span> <span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="o">|</span> <span class="n">Albert</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Fred</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Mike</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Dan</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Marry</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Joe</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="c1">-- Person with unknown(`NULL`) ages are skipped from processing.</span>
<span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="k">count</span><span class="p">(</span><span class="o">*</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">GROUP</span> <span class="k">BY</span> <span class="n">age</span> <span class="k">HAVING</span> <span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">18</span><span class="p">;</span>
<span class="o">+</span><span class="c1">---+--------+</span>
<span class="o">|</span><span class="n">age</span><span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">|</span>
<span class="o">+</span><span class="c1">---+--------+</span>
<span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="mi">2</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">30</span><span class="o">|</span> <span class="mi">2</span><span class="o">|</span>
<span class="o">+</span><span class="c1">---+--------+</span>
<span class="c1">-- A self join case with a join condition `p1.age = p2.age AND p1.name = p2.name`.</span>
<span class="c1">-- The persons with unknown age (`NULL`) are filtered out by the join operator.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="n">p1</span><span class="p">,</span> <span class="n">person</span> <span class="n">p2</span>
<span class="k">WHERE</span> <span class="n">p1</span><span class="p">.</span><span class="n">age</span> <span class="o">=</span> <span class="n">p2</span><span class="p">.</span><span class="n">age</span>
<span class="k">AND</span> <span class="n">p1</span><span class="p">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">p2</span><span class="p">.</span><span class="n">name</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+---+--------+---+</span>
<span class="o">|</span> <span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span> <span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+---+--------+---+</span>
<span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Fred</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Fred</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Mike</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span> <span class="n">Mike</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Dan</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Dan</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Joe</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span> <span class="n">Joe</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+---+--------+---+</span>
<span class="c1">-- The age column from both legs of join are compared using null-safe equal which</span>
<span class="c1">-- is why the persons with unknown age (`NULL`) are qualified by the join.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="n">p1</span><span class="p">,</span> <span class="n">person</span> <span class="n">p2</span>
<span class="k">WHERE</span> <span class="n">p1</span><span class="p">.</span><span class="n">age</span> <span class="o">&lt;=&gt;</span> <span class="n">p2</span><span class="p">.</span><span class="n">age</span>
<span class="k">AND</span> <span class="n">p1</span><span class="p">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">p2</span><span class="p">.</span><span class="n">name</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+----+--------+----+</span>
<span class="o">|</span> <span class="n">name</span><span class="o">|</span> <span class="n">age</span><span class="o">|</span> <span class="n">name</span><span class="o">|</span> <span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+--------+----+</span>
<span class="o">|</span> <span class="n">Albert</span><span class="o">|</span><span class="k">null</span><span class="o">|</span> <span class="n">Albert</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Fred</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Fred</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Mike</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span> <span class="n">Mike</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Dan</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Dan</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Marry</span><span class="o">|</span><span class="k">null</span><span class="o">|</span> <span class="n">Marry</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Joe</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span> <span class="n">Joe</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+--------+----+</span>
</code></pre></div></div>
<h3 id="aggregate-operator-group-by-distinct-">Aggregate Operator (GROUP BY, DISTINCT) <a name="aggregate-operator"></a></h3>
<p>As discussed in the previous section <a href="sql-ref-null-semantics.html#comparison-operators">comparison operator</a>,
two <code class="language-plaintext highlighter-rouge">NULL</code> values are not equal. However, for the purpose of grouping and distinct processing, the two or more
values with <code class="language-plaintext highlighter-rouge">NULL data</code>are grouped together into the same bucket. This behaviour is conformant with SQL
standard and with other enterprise database management systems.</p>
<h4 id="examples-6">Examples</h4>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="c1">-- `NULL` values are put in one bucket in `GROUP BY` processing.</span>
<span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="k">count</span><span class="p">(</span><span class="o">*</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">GROUP</span> <span class="k">BY</span> <span class="n">age</span><span class="p">;</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span> <span class="n">age</span><span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span> <span class="mi">2</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="mi">2</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">30</span><span class="o">|</span> <span class="mi">2</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">18</span><span class="o">|</span> <span class="mi">1</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="c1">-- All `NULL` ages are considered one distinct value in `DISTINCT` processing.</span>
<span class="k">SELECT</span> <span class="k">DISTINCT</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">person</span><span class="p">;</span>
<span class="o">+</span><span class="c1">----+</span>
<span class="o">|</span> <span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">18</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+</span>
</code></pre></div></div>
<h3 id="sort-operator-order-by-clause-">Sort Operator (ORDER BY Clause) <a name="order-by"></a></h3>
<p>Spark SQL supports null ordering specification in <code class="language-plaintext highlighter-rouge">ORDER BY</code> clause. Spark processes the <code class="language-plaintext highlighter-rouge">ORDER BY</code> clause by
placing all the <code class="language-plaintext highlighter-rouge">NULL</code> values at first or at last depending on the null ordering specification. By default, all
the <code class="language-plaintext highlighter-rouge">NULL</code> values are placed at first.</p>
<h4 id="examples-7">Examples</h4>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="c1">-- `NULL` values are shown at first and other values</span>
<span class="c1">-- are sorted in ascending way.</span>
<span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="n">name</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">ORDER</span> <span class="k">BY</span> <span class="n">age</span><span class="p">;</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span> <span class="n">age</span><span class="o">|</span> <span class="n">name</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span> <span class="n">Marry</span><span class="o">|</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span> <span class="n">Albert</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">18</span><span class="o">|</span> <span class="n">Mike</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">30</span><span class="o">|</span><span class="n">Michelle</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">30</span><span class="o">|</span> <span class="n">Joe</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Fred</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Dan</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="c1">-- Column values other than `NULL` are sorted in ascending</span>
<span class="c1">-- way and `NULL` values are shown at the last.</span>
<span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="n">name</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">ORDER</span> <span class="k">BY</span> <span class="n">age</span> <span class="n">NULLS</span> <span class="k">LAST</span><span class="p">;</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span> <span class="n">age</span><span class="o">|</span> <span class="n">name</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span> <span class="mi">18</span><span class="o">|</span> <span class="n">Mike</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">30</span><span class="o">|</span><span class="n">Michelle</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">30</span><span class="o">|</span> <span class="n">Joe</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Dan</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Fred</span><span class="o">|</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span> <span class="n">Marry</span><span class="o">|</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span> <span class="n">Albert</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="c1">-- Columns other than `NULL` values are sorted in descending</span>
<span class="c1">-- and `NULL` values are shown at the last.</span>
<span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="n">name</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">ORDER</span> <span class="k">BY</span> <span class="n">age</span> <span class="k">DESC</span> <span class="n">NULLS</span> <span class="k">LAST</span><span class="p">;</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span> <span class="n">age</span><span class="o">|</span> <span class="n">name</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Fred</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Dan</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">30</span><span class="o">|</span><span class="n">Michelle</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">30</span><span class="o">|</span> <span class="n">Joe</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">18</span><span class="o">|</span> <span class="n">Mike</span><span class="o">|</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span> <span class="n">Marry</span><span class="o">|</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span> <span class="n">Albert</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
</code></pre></div></div>
<h3 id="set-operators-union-intersect-except-">Set Operators (UNION, INTERSECT, EXCEPT) <a name="set-operators"></a></h3>
<p><code class="language-plaintext highlighter-rouge">NULL</code> values are compared in a null-safe manner for equality in the context of
set operations. That means when comparing rows, two <code class="language-plaintext highlighter-rouge">NULL</code> values are considered
equal unlike the regular <code class="language-plaintext highlighter-rouge">EqualTo</code>(<code class="language-plaintext highlighter-rouge">=</code>) operator.</p>
<h4 id="examples-8">Examples</h4>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">CREATE</span> <span class="k">VIEW</span> <span class="n">unknown_age</span> <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="n">age</span> <span class="k">IS</span> <span class="k">NULL</span><span class="p">;</span>
<span class="c1">-- Only common rows between two legs of `INTERSECT` are in the </span>
<span class="c1">-- result set. The comparison between columns of the row are done</span>
<span class="c1">-- in a null-safe manner.</span>
<span class="k">SELECT</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">person</span>
<span class="k">INTERSECT</span>
<span class="k">SELECT</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span> <span class="k">from</span> <span class="n">unknown_age</span><span class="p">;</span>
<span class="o">+</span><span class="c1">------+----+</span>
<span class="o">|</span> <span class="n">name</span><span class="o">|</span> <span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">------+----+</span>
<span class="o">|</span><span class="n">Albert</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Marry</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">+</span><span class="c1">------+----+</span>
<span class="c1">-- `NULL` values from two legs of the `EXCEPT` are not in output. </span>
<span class="c1">-- This basically shows that the comparison happens in a null-safe manner.</span>
<span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="n">name</span> <span class="k">FROM</span> <span class="n">person</span>
<span class="k">EXCEPT</span>
<span class="k">SELECT</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">unknown_age</span><span class="p">;</span>
<span class="o">+</span><span class="c1">---+--------+</span>
<span class="o">|</span><span class="n">age</span><span class="o">|</span> <span class="n">name</span><span class="o">|</span>
<span class="o">+</span><span class="c1">---+--------+</span>
<span class="o">|</span> <span class="mi">30</span><span class="o">|</span> <span class="n">Joe</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Fred</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">30</span><span class="o">|</span><span class="n">Michelle</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">18</span><span class="o">|</span> <span class="n">Mike</span><span class="o">|</span>
<span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Dan</span><span class="o">|</span>
<span class="o">+</span><span class="c1">---+--------+</span>
<span class="c1">-- Performs `UNION` operation between two sets of data. </span>
<span class="c1">-- The comparison between columns of the row ae done in</span>
<span class="c1">-- null-safe manner.</span>
<span class="k">SELECT</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">person</span>
<span class="k">UNION</span>
<span class="k">SELECT</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">unknown_age</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="o">|</span> <span class="n">name</span><span class="o">|</span> <span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="o">|</span> <span class="n">Albert</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Joe</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Marry</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Fred</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Mike</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Dan</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span>
</code></pre></div></div>
<h3 id="existsnot-exists-subquery-">EXISTS/NOT EXISTS Subquery <a name="exists-not-exists"></a></h3>
<p>In Spark, EXISTS and NOT EXISTS expressions are allowed inside a WHERE clause.
These are boolean expressions which return either <code class="language-plaintext highlighter-rouge">TRUE</code> or
<code class="language-plaintext highlighter-rouge">FALSE</code>. In other words, EXISTS is a membership condition and returns <code class="language-plaintext highlighter-rouge">TRUE</code>
when the subquery it refers to returns one or more rows. Similarly, NOT EXISTS
is a non-membership condition and returns TRUE when no rows or zero rows are
returned from the subquery.</p>
<p>These two expressions are not affected by presence of NULL in the result of
the subquery. They are normally faster because they can be converted to
semijoins / anti-semijoins without special provisions for null awareness.</p>
<h4 id="examples-9">Examples</h4>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="c1">-- Even if subquery produces rows with `NULL` values, the `EXISTS` expression</span>
<span class="c1">-- evaluates to `TRUE` as the subquery produces 1 row.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="k">EXISTS</span> <span class="p">(</span><span class="k">SELECT</span> <span class="k">null</span><span class="p">);</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="o">|</span> <span class="n">name</span><span class="o">|</span> <span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="o">|</span> <span class="n">Albert</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Fred</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Mike</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Dan</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Marry</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Joe</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="c1">-- `NOT EXISTS` expression returns `FALSE`. It returns `TRUE` only when</span>
<span class="c1">-- subquery produces no rows. In this case, it returns 1 row.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="k">NOT</span> <span class="k">EXISTS</span> <span class="p">(</span><span class="k">SELECT</span> <span class="k">null</span><span class="p">);</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">|</span><span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="c1">-- `NOT EXISTS` expression returns `TRUE`.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="k">NOT</span> <span class="k">EXISTS</span> <span class="p">(</span><span class="k">SELECT</span> <span class="mi">1</span> <span class="k">WHERE</span> <span class="mi">1</span> <span class="o">=</span> <span class="mi">0</span><span class="p">);</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="o">|</span> <span class="n">name</span><span class="o">|</span> <span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="o">|</span> <span class="n">Albert</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Fred</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Mike</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Dan</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Marry</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Joe</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span>
</code></pre></div></div>
<h3 id="innot-in-subquery-">IN/NOT IN Subquery <a name="in-not-in"></a></h3>
<p>In Spark, <code class="language-plaintext highlighter-rouge">IN</code> and <code class="language-plaintext highlighter-rouge">NOT IN</code> expressions are allowed inside a WHERE clause of
a query. Unlike the <code class="language-plaintext highlighter-rouge">EXISTS</code> expression, <code class="language-plaintext highlighter-rouge">IN</code> expression can return a <code class="language-plaintext highlighter-rouge">TRUE</code>,
<code class="language-plaintext highlighter-rouge">FALSE</code> or <code class="language-plaintext highlighter-rouge">UNKNOWN (NULL)</code> value. Conceptually a <code class="language-plaintext highlighter-rouge">IN</code> expression is semantically
equivalent to a set of equality condition separated by a disjunctive operator (<code class="language-plaintext highlighter-rouge">OR</code>).
For example, c1 IN (1, 2, 3) is semantically equivalent to <code class="language-plaintext highlighter-rouge">(C1 = 1 OR c1 = 2 OR c1 = 3)</code>.</p>
<p>As far as handling <code class="language-plaintext highlighter-rouge">NULL</code> values are concerned, the semantics can be deduced from
the <code class="language-plaintext highlighter-rouge">NULL</code> value handling in comparison operators(<code class="language-plaintext highlighter-rouge">=</code>) and logical operators(<code class="language-plaintext highlighter-rouge">OR</code>).
To summarize, below are the rules for computing the result of an <code class="language-plaintext highlighter-rouge">IN</code> expression.</p>
<ul>
<li>TRUE is returned when the non-NULL value in question is found in the list</li>
<li>FALSE is returned when the non-NULL value is not found in the list and the
list does not contain NULL values</li>
<li>UNKNOWN is returned when the value is <code class="language-plaintext highlighter-rouge">NULL</code>, or the non-NULL value is not found in the list
and the list contains at least one <code class="language-plaintext highlighter-rouge">NULL</code> value</li>
</ul>
<p>NOT IN always returns UNKNOWN when the list contains <code class="language-plaintext highlighter-rouge">NULL</code>, regardless of the input value.
This is because IN returns UNKNOWN if the value is not in the list containing <code class="language-plaintext highlighter-rouge">NULL</code>,
and because NOT UNKNOWN is again UNKNOWN.</p>
<h4 id="examples-10">Examples</h4>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="c1">-- The subquery has only `NULL` value in its result set. Therefore,</span>
<span class="c1">-- the result of `IN` predicate is UNKNOWN.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="n">age</span> <span class="k">IN</span> <span class="p">(</span><span class="k">SELECT</span> <span class="k">null</span><span class="p">);</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">|</span><span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="c1">-- The subquery has `NULL` value in the result set as well as a valid </span>
<span class="c1">-- value `50`. Rows with age = 50 are returned. </span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span>
<span class="k">WHERE</span> <span class="n">age</span> <span class="k">IN</span> <span class="p">(</span><span class="k">SELECT</span> <span class="n">age</span> <span class="k">FROM</span> <span class="k">VALUES</span> <span class="p">(</span><span class="mi">50</span><span class="p">),</span> <span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="n">sub</span><span class="p">(</span><span class="n">age</span><span class="p">));</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">|</span><span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">|</span><span class="n">Fred</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Dan</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="c1">-- Since subquery has `NULL` value in the result set, the `NOT IN`</span>
<span class="c1">-- predicate would return UNKNOWN. Hence, no rows are</span>
<span class="c1">-- qualified for this query.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span>
<span class="k">WHERE</span> <span class="n">age</span> <span class="k">NOT</span> <span class="k">IN</span> <span class="p">(</span><span class="k">SELECT</span> <span class="n">age</span> <span class="k">FROM</span> <span class="k">VALUES</span> <span class="p">(</span><span class="mi">50</span><span class="p">),</span> <span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="n">sub</span><span class="p">(</span><span class="n">age</span><span class="p">));</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">|</span><span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">+</span><span class="c1">----+---+</span>
</code></pre></div></div>
</div>
<!-- /container -->
</div>
<script src="js/vendor/jquery-3.5.1.min.js"></script>
<script src="js/vendor/bootstrap.bundle.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<script type="text/javascript" src="js/vendor/docsearch.min.js"></script>
<script type="text/javascript">
// DocSearch is entirely free and automated. DocSearch is built in two parts:
// 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link
// in your website and extract content from every page it traverses. It then pushes this
// content to an Algolia index.
// 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index
// to your search input and display its results in a dropdown UI. If you want to find more
// details on how works DocSearch, check the docs of DocSearch.
docsearch({
apiKey: 'd62f962a82bc9abb53471cb7b89da35e',
appId: 'RAI69RXRSK',
indexName: 'apache_spark',
inputSelector: '#docsearch-input',
enhancedSearchInput: true,
algoliaOptions: {
'facetFilters': ["version:3.5.0"]
},
debug: false // Set debug to true if you want to inspect the dropdown
});
</script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
// We could use "//cdn.mathjax...", but that won't support "file://".
(function(d, script) {
script = d.createElement('script');
script.type = 'text/javascript';
script.async = true;
script.onload = function(){
MathJax.Hub.Config({
tex2jax: {
inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
processEscapes: true,
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
});
};
script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
'?config=TeX-AMS-MML_HTMLorMML';
d.getElementsByTagName('head')[0].appendChild(script);
}(document));
</script>
</body>
</html>