blob: f1fcc500fd9bb90bc46981b8af5eb95fab543655 [file] [log] [blame]
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<title>NULL Semantics - Spark 3.0.0-preview2 Documentation</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<style>
body {
padding-top: 60px;
padding-bottom: 40px;
}
</style>
<meta name="viewport" content="width=device-width">
<link rel="stylesheet" href="css/bootstrap-responsive.min.css">
<link rel="stylesheet" href="css/main.css">
<script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
<link rel="stylesheet" href="css/pygments-default.css">
<!-- Google analytics script -->
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-32518208-2']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
</head>
<body>
<!--[if lt IE 7]>
<p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
<![endif]-->
<!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
<div class="navbar navbar-fixed-top" id="topbar">
<div class="navbar-inner">
<div class="container">
<div class="brand"><a href="index.html">
<img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">3.0.0-preview2</span>
</div>
<ul class="nav">
<!--TODO(andyk): Add class="active" attribute to li some how.-->
<li><a href="index.html">Overview</a></li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a>
<ul class="dropdown-menu">
<li><a href="quick-start.html">Quick Start</a></li>
<li><a href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a></li>
<li><a href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a></li>
<li><a href="structured-streaming-programming-guide.html">Structured Streaming</a></li>
<li><a href="streaming-programming-guide.html">Spark Streaming (DStreams)</a></li>
<li><a href="ml-guide.html">MLlib (Machine Learning)</a></li>
<li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li>
<li><a href="sparkr.html">SparkR (R on Spark)</a></li>
</ul>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a>
<ul class="dropdown-menu">
<li><a href="api/scala/index.html#org.apache.spark.package">Scala</a></li>
<li><a href="api/java/index.html">Java</a></li>
<li><a href="api/python/index.html">Python</a></li>
<li><a href="api/R/index.html">R</a></li>
<li><a href="api/sql/index.html">SQL, Built-in Functions</a></li>
</ul>
</li>
<li class="dropdown">
<a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a>
<ul class="dropdown-menu">
<li><a href="cluster-overview.html">Overview</a></li>
<li><a href="submitting-applications.html">Submitting Applications</a></li>
<li class="divider"></li>
<li><a href="spark-standalone.html">Spark Standalone</a></li>
<li><a href="running-on-mesos.html">Mesos</a></li>
<li><a href="running-on-yarn.html">YARN</a></li>
<li><a href="running-on-kubernetes.html">Kubernetes</a></li>
</ul>
</li>
<li class="dropdown">
<a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a>
<ul class="dropdown-menu">
<li><a href="configuration.html">Configuration</a></li>
<li><a href="monitoring.html">Monitoring</a></li>
<li><a href="tuning.html">Tuning Guide</a></li>
<li><a href="job-scheduling.html">Job Scheduling</a></li>
<li><a href="security.html">Security</a></li>
<li><a href="hardware-provisioning.html">Hardware Provisioning</a></li>
<li><a href="migration-guide.html">Migration Guide</a></li>
<li class="divider"></li>
<li><a href="building-spark.html">Building Spark</a></li>
<li><a href="https://spark.apache.org/contributing.html">Contributing to Spark</a></li>
<li><a href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a></li>
</ul>
</li>
</ul>
<!--<p class="navbar-text pull-right"><span class="version-text">v3.0.0-preview2</span></p>-->
</div>
</div>
</div>
<div class="container-wrapper">
<div class="left-menu-wrapper">
<div class="left-menu">
<h3><a href="sql-programming-guide.html">Spark SQL Guide</a></h3>
<ul>
<li>
<a href="sql-getting-started.html">
Getting Started
</a>
</li>
<li>
<a href="sql-data-sources.html">
Data Sources
</a>
</li>
<li>
<a href="sql-performance-tuning.html">
Performance Tuning
</a>
</li>
<li>
<a href="sql-distributed-sql-engine.html">
Distributed SQL Engine
</a>
</li>
<li>
<a href="sql-pyspark-pandas-with-arrow.html">
PySpark Usage Guide for Pandas with Apache Arrow
</a>
</li>
<li>
<a href="sql-migration-old.html">
Migration Guide
</a>
</li>
<li>
<a href="sql-ref.html">
SQL Reference
</a>
</li>
<ul>
<li>
<a href="sql-ref-datatypes.html">
Data Types
</a>
</li>
<li>
<a href="sql-ref-null-semantics.html">
<b>Null Semantics</b>
</a>
</li>
<li>
<a href="sql-ref-nan-semantics.html">
NaN Semantics
</a>
</li>
<li>
<a href="sql-ref-syntax.html">
SQL Syntax
</a>
</li>
<li>
<a href="sql-ref-functions.html">
Functions
</a>
</li>
<li>
<a href="sql-ref-arithmetic-ops.html">
Arthmetic operations
</a>
</li>
</ul>
</ul>
</div>
</div>
<input id="nav-trigger" class="nav-trigger" checked type="checkbox">
<label for="nav-trigger"></label>
<div class="content-with-sidebar" id="content">
<h1 class="title">NULL Semantics</h1>
<h3 id="description">Description</h3>
<p>A table consists of a set of rows and each row contains a set of columns.
A column is associated with a data type and represents
a specific attribute of an entity (for example, <code class="highlighter-rouge">age</code> is a column of an
entity called <code class="highlighter-rouge">person</code>). Sometimes, the value of a column
specific to a row is not known at the time the row comes into existence.
In <code class="highlighter-rouge">SQL</code>, such values are represnted as <code class="highlighter-rouge">NULL</code>. This section details the
semantics of <code class="highlighter-rouge">NULL</code> values handling in various operators, expressions and
other <code class="highlighter-rouge">SQL</code> constructs.</p>
<ol>
<li><a href="#comp-operators">Null handling in comparison operators</a></li>
<li><a href="#logical-operators">Null handling in Logical operators</a></li>
<li><a href="#expressions">Null handling in Expressions</a>
<ol>
<li><a href="#null-in-tolerant">Null handling in null-in-tolerant expressions</a></li>
<li><a href="#can-process-null">Null handling Expressions that can process null value operands</a></li>
<li><a href="#built-in-aggregate">Null handling in built-in aggregate expressions</a></li>
</ol>
</li>
<li><a href="#condition-expressions">Null handling in WHERE, HAVING and JOIN conditions</a></li>
<li><a href="#aggregate-operator">Null handling in GROUP BY and DISTINCT</a></li>
<li><a href="#order-by">Null handling in ORDER BY</a></li>
<li><a href="#set-operators">Null handling in UNION, INTERSECT, EXCEPT</a></li>
<li><a href="#exists-not-exists">Null handling in EXISTS and NOT EXISTS subquery</a></li>
<li><a href="#in-not-in">Null handling in IN and NOT IN subquery</a></li>
</ol>
<style type="text/css">
.tsclass {font-size:12px;color:#333333;width:40%;border-width: 2px;border-color: #729ea5;border-collapse: collapse;}
.tsclass th {text-align: left;}
</style>
<p>The following illustrates the schema layout and data of a table named <code class="highlighter-rouge">person</code>. The data contains <code class="highlighter-rouge">NULL</code> values in
the <code class="highlighter-rouge">age</code> column and this table will be used in various examples in the sections below.
<strong><u>TABLE: person</u></strong></p>
<table class="tsclass" border="1">
<tr><th>Id</th><th>Name</th><th>Age</th></tr>
<tr><td>100</td><td>Joe</td><td>30</td></tr>
<tr><td>200</td><td>Marry</td><td>NULL</td></tr>
<tr><td>300</td><td>Mike</td><td>18</td></tr>
<tr><td>400</td><td>Fred</td><td>50</td></tr>
<tr><td>500</td><td>Albert</td><td>NULL</td></tr>
<tr><td>600</td><td>Michelle</td><td>30</td></tr>
<tr><td>700</td><td>Dan</td><td>50</td></tr>
</table>
<h3 id="comparision-operators-">Comparision operators <a name="comp-operators"></a></h3>
<p>Apache spark supports the standard comparison operators such as &#8216;&gt;&#8217;, &#8216;&gt;=&#8217;, &#8216;=&#8217;, &#8216;&lt;&#8217; and &#8216;&lt;=&#8217;.
The result of these operators is unknown or <code class="highlighter-rouge">NULL</code> when one of the operarands or both the operands are
unknown or <code class="highlighter-rouge">NULL</code>. In order to compare the <code class="highlighter-rouge">NULL</code> values for equality, Spark provides a null-safe
equal operator (&#8216;&lt;=&gt;&#8217;), which returns <code class="highlighter-rouge">False</code> when one of the operand is <code class="highlighter-rouge">NULL</code> and returns &#8216;True<code class="highlighter-rouge"> when
both the operands are </code>NULL<code class="highlighter-rouge">. The following table illustrates the behaviour of comparison operators when
one or both operands are </code>NULL`:</p>
<table class="tsclass" border="1">
<tr>
<th>Left Operand</th>
<th>Right Operand</th>
<th>&gt;</th>
<th>&gt;=</th>
<th>=</th>
<th>&lt;</th>
<th>&lt;=</th>
<th>&lt;=&gt;</th>
</tr>
<tr>
<td>NULL</td>
<td>Any value</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>False</td>
</tr>
<tr>
<td>Any value</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>False</td>
</tr>
<tr>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>True</td>
</tr>
</table>
<h3 id="examples">Examples</h3>
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- Normal comparison operators return `NULL` when one of the operand is `NULL`.</span>
<span class="k">SELECT</span> <span class="mi">5</span> <span class="o">&gt;</span> <span class="k">null</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="k">null</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="c1">-- Normal comparison operators return `NULL` when both the operands are `NULL`.</span>
<span class="k">SELECT</span> <span class="k">null</span> <span class="o">=</span> <span class="k">null</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="k">null</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="c1">-- Null-safe equal operator return `False` when one of the operand is `NULL`</span>
<span class="k">SELECT</span> <span class="mi">5</span> <span class="o">&lt;=&gt;</span> <span class="k">null</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="k">false</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="c1">-- Null-safe equal operator return `True` when one of the operand is `NULL`</span>
<span class="k">SELECT</span> <span class="k">NULL</span> <span class="o">&lt;=&gt;</span> <span class="k">NULL</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="k">true</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span></code></pre></figure>
<h3 id="logical-operators-">Logical operators <a name="logical-operators"></a></h3>
<p>Spark supports standard logical operators such as <code class="highlighter-rouge">AND</code>, <code class="highlighter-rouge">OR</code> and <code class="highlighter-rouge">NOT</code>. These operators take <code class="highlighter-rouge">Boolean</code> expressions
as the arguments and return a <code class="highlighter-rouge">Boolean</code> value.</p>
<p>The following tables illustrate the behavior of logical opeators when one or both operands are <code class="highlighter-rouge">NULL</code>.</p>
<table class="tsclass" border="1">
<tr>
<th>Left Operand</th>
<th>Right Operand</th>
<th>OR</th>
<th>AND</th>
</tr>
<tr>
<td>True</td>
<td>NULL</td>
<td>True</td>
<td>NULL</td>
</tr>
<tr>
<td>False</td>
<td>NULL</td>
<td>NULL</td>
<td>False</td>
</tr>
<tr>
<td>NULL</td>
<td>True</td>
<td>True</td>
<td>NULL</td>
</tr>
<tr>
<td>NULL</td>
<td>False</td>
<td>NULL</td>
<td>NULL</td>
</tr>
<tr>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
<td>NULL</td>
</tr>
</table>
<p><br /></p>
<table class="tsclass" border="1">
<tr>
<th>operand</th>
<th>NOT</th>
</tr>
<tr>
<td>NULL</td>
<td>NULL</td>
</tr>
</table>
<h3 id="examples-1">Examples</h3>
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- Normal comparison operators return `NULL` when one of the operands is `NULL`.</span>
<span class="k">SELECT</span> <span class="p">(</span><span class="k">true</span> <span class="k">OR</span> <span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="k">true</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="c1">-- Normal comparison operators return `NULL` when both the operands are `NULL`.</span>
<span class="k">SELECT</span> <span class="p">(</span><span class="k">null</span> <span class="k">OR</span> <span class="k">false</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="k">null</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="c1">-- Null-safe equal operator returns `False` when one of the operands is `NULL`</span>
<span class="k">SELECT</span> <span class="k">NOT</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="k">null</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span></code></pre></figure>
<h3 id="expressions-">Expressions <a name="expressions"></a></h3>
<p>The comparison operators and logical operators are treated as expressions in
Spark. Other than these two kinds of expressions, Spark supports other form of
expressions such as function expressions, cast expressions, etc. The expressions
in Spark can be broadly classified as :</p>
<ul>
<li>Null in-tolerent expressions</li>
<li>Expressions that can process <code class="highlighter-rouge">NULL</code> value operands
<ul>
<li>The result of these expressions depends on the expression itself.</li>
</ul>
</li>
</ul>
<h4 id="null-in-tolerant-expressions-">Null in-tolerant expressions <a name="null-in-tolerant"></a></h4>
<p>Null in-tolerant expressions return <code class="highlighter-rouge">NULL</code> when one or more arguments of
expression are <code class="highlighter-rouge">NULL</code> and most of the expressions fall in this category.</p>
<h5 id="examples-2">Examples</h5>
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">SELECT</span> <span class="n">concat</span><span class="p">(</span><span class="s1">'John'</span><span class="p">,</span> <span class="k">null</span><span class="p">)</span> <span class="k">as</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="k">null</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="k">SELECT</span> <span class="n">positive</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">as</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="k">null</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="k">SELECT</span> <span class="n">to_date</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">as</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="k">null</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span></code></pre></figure>
<h4 id="expressions-that-can-process-null-value-operands-">Expressions that can process null value operands. <a name="can-process-null"></a></h4>
<p>This class of expressions are designed to handle <code class="highlighter-rouge">NULL</code> values. The result of the
expressions depends on the expression itself. As an example, function expression <code class="highlighter-rouge">isnull</code>
returns a <code class="highlighter-rouge">true</code> on null input and <code class="highlighter-rouge">false</code> on non null input where as function <code class="highlighter-rouge">coalesce</code>
returns the first non <code class="highlighter-rouge">NULL</code> value in its list of operands. However, <code class="highlighter-rouge">coalesce</code> returns
<code class="highlighter-rouge">NULL</code> when all its operands are <code class="highlighter-rouge">NULL</code>. Below is an incomplete list of expressions of this category.</p>
<ul>
<li>COALESCE</li>
<li>NULLIF</li>
<li>IFNULL</li>
<li>NVL</li>
<li>NVL2</li>
<li>ISNAN</li>
<li>NANVL</li>
<li>ISNULL</li>
<li>ISNOTNULL</li>
<li>ATLEASTNNONNULLS</li>
<li>IN</li>
</ul>
<h5 id="examples-3">Examples</h5>
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">SELECT</span> <span class="k">isnull</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="k">true</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="c1">-- Returns the first occurence of non `NULL` value.</span>
<span class="k">SELECT</span> <span class="n">coalesce</span><span class="p">(</span><span class="k">null</span><span class="p">,</span> <span class="k">null</span><span class="p">,</span> <span class="mi">3</span><span class="p">,</span> <span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="mi">3</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="c1">-- Returns `NULL` as all its operands are `NULL`. </span>
<span class="k">SELECT</span> <span class="n">coalesce</span><span class="p">(</span><span class="k">null</span><span class="p">,</span> <span class="k">null</span><span class="p">,</span> <span class="k">null</span><span class="p">,</span> <span class="k">null</span><span class="p">)</span> <span class="k">AS</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="k">null</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="k">SELECT</span> <span class="n">isnan</span><span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="k">as</span> <span class="n">expression_output</span><span class="p">;</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="n">expression_output</span><span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span>
<span class="o">|</span><span class="k">false</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">-----------------+</span></code></pre></figure>
<h4 id="builtin-aggregate-expressions-">Builtin Aggregate Expressions <a name="built-in-aggregate"></a></h4>
<p>Aggregate functions compute a single result by processing a set of input rows. Below are
the rules of how <code class="highlighter-rouge">NULL</code> values are handled by aggregate functions.</p>
<ul>
<li><code class="highlighter-rouge">NULL</code> values are ignored from processing by all the aggregate functions.
<ul>
<li>Only exception to this rule is COUNT(*) function.</li>
</ul>
</li>
<li>Some aggregate functions return <code class="highlighter-rouge">NULL</code> when all input values are <code class="highlighter-rouge">NULL</code> or the input data set
is empty.<br /> The list of these functions is:
<ul>
<li>MAX</li>
<li>MIN</li>
<li>SUM</li>
<li>AVG</li>
<li>EVERY</li>
<li>ANY</li>
<li>SOME</li>
</ul>
</li>
</ul>
<h4 id="examples-4">Examples</h4>
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- `count(*)` does not skip `NULL` values.</span>
<span class="k">SELECT</span> <span class="k">count</span><span class="p">(</span><span class="o">*</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span><span class="mi">7</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="c1">-- `NULL` values in column `age` are skipped from processing.</span>
<span class="k">SELECT</span> <span class="k">count</span><span class="p">(</span><span class="n">age</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span><span class="p">;</span>
<span class="o">+</span><span class="c1">----------+</span>
<span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="n">age</span><span class="p">)</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----------+</span>
<span class="o">|</span><span class="mi">5</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">----------+</span>
<span class="c1">-- `count(*)` on an empty input set returns 0. This is unlike the other</span>
<span class="c1">-- aggregate functions, such as `max`, which return `NULL`.</span>
<span class="k">SELECT</span> <span class="k">count</span><span class="p">(</span><span class="o">*</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">where</span> <span class="mi">1</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span><span class="mi">0</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="c1">-- `NULL` values are excluded from computation of maximum value.</span>
<span class="k">SELECT</span> <span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span><span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="c1">-- `max` returns `NULL` on an empty input set.</span>
<span class="k">SELECT</span> <span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">where</span> <span class="mi">1</span> <span class="o">=</span> <span class="mi">0</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span><span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span>
<span class="o">|</span><span class="k">null</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">--------+</span></code></pre></figure>
<h3 id="condition-expressions-in-where-having-and-join-clauses-">Condition expressions in WHERE, HAVING and JOIN clauses. <a name="condition-expressions"></a></h3>
<p><code class="highlighter-rouge">WHERE</code>, <code class="highlighter-rouge">HAVING</code> operators filter rows based on the user specified condition.
A <code class="highlighter-rouge">JOIN</code> operator is used to combine rows from two tables based on a join condition.
For all the three operators, a condition expression is a boolean expression and can return
<code>True, False or Unknown (NULL)</code>. They are &#8220;satisfied&#8221; if the result of the condition is <code class="highlighter-rouge">True</code>.</p>
<h4 id="examples-5">Examples</h4>
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- Persons whose age is unknown (`NULL`) are filtered out from the result set.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="n">age</span> <span class="o">&gt;</span> <span class="mi">0</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+---+</span>
<span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+---+</span>
<span class="o">|</span><span class="n">Michelle</span><span class="o">|</span><span class="mi">30</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Fred</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Mike</span> <span class="o">|</span><span class="mi">18</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Joe</span> <span class="o">|</span><span class="mi">30</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">--------+---+</span>
<span class="c1">-- `IS NULL` expression is used in disjunction to select the persons</span>
<span class="c1">-- with unknown (`NULL`) records.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="n">age</span> <span class="o">&gt;</span> <span class="mi">0</span> <span class="k">OR</span> <span class="n">age</span> <span class="k">IS</span> <span class="k">NULL</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="o">|</span><span class="n">Albert</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="n">Michelle</span><span class="o">|</span><span class="mi">30</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Fred</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Mike</span> <span class="o">|</span><span class="mi">18</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Marry</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="n">Joe</span> <span class="o">|</span><span class="mi">30</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="c1">-- Person with unknown(`NULL`) ages are skipped from processing.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">GROUP</span> <span class="k">BY</span> <span class="n">age</span> <span class="k">HAVING</span> <span class="k">max</span><span class="p">(</span><span class="n">age</span><span class="p">)</span> <span class="o">&gt;</span> <span class="mi">18</span><span class="p">;</span>
<span class="o">+</span><span class="c1">---+--------+ </span>
<span class="o">|</span><span class="n">age</span><span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">|</span>
<span class="o">+</span><span class="c1">---+--------+</span>
<span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="mi">2</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="mi">2</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">---+--------+</span>
<span class="c1">-- A self join case with a join condition `p1.age = p2.age AND p1.name = p2.name`.</span>
<span class="c1">-- The persons with unknown age (`NULL`) are filtered out by the join operator.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="n">p1</span><span class="p">,</span> <span class="n">person</span> <span class="n">p2</span>
<span class="k">WHERE</span> <span class="n">p1</span><span class="p">.</span><span class="n">age</span> <span class="o">=</span> <span class="n">p2</span><span class="p">.</span><span class="n">age</span>
<span class="k">AND</span> <span class="n">p1</span><span class="p">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">p2</span><span class="p">.</span><span class="n">name</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+---+--------+---+</span>
<span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span><span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+---+--------+---+</span>
<span class="o">|</span><span class="n">Michelle</span><span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span><span class="mi">30</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Fred</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Mike</span> <span class="o">|</span><span class="mi">18</span> <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span><span class="mi">18</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Joe</span> <span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span><span class="mi">30</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">--------+---+--------+---+</span>
<span class="c1">-- The age column from both legs of join are compared using null-safe equal which</span>
<span class="c1">-- is why the persons with unknown age (`NULL`) are qualified by the join.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="n">p1</span><span class="p">,</span> <span class="n">person</span> <span class="n">p2</span>
<span class="k">WHERE</span> <span class="n">p1</span><span class="p">.</span><span class="n">age</span> <span class="o">&lt;=&gt;</span> <span class="n">p2</span><span class="p">.</span><span class="n">age</span>
<span class="k">AND</span> <span class="n">p1</span><span class="p">.</span><span class="n">name</span> <span class="o">=</span> <span class="n">p2</span><span class="p">.</span><span class="n">name</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+----+--------+----+</span>
<span class="o">|</span> <span class="n">name</span><span class="o">|</span> <span class="n">age</span><span class="o">|</span> <span class="n">name</span><span class="o">|</span> <span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+--------+----+</span>
<span class="o">|</span> <span class="n">Albert</span><span class="o">|</span><span class="k">null</span><span class="o">|</span> <span class="n">Albert</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span><span class="n">Michelle</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Fred</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Fred</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Mike</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span> <span class="n">Mike</span><span class="o">|</span> <span class="mi">18</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Dan</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span> <span class="n">Dan</span><span class="o">|</span> <span class="mi">50</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Marry</span><span class="o">|</span><span class="k">null</span><span class="o">|</span> <span class="n">Marry</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span> <span class="n">Joe</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span> <span class="n">Joe</span><span class="o">|</span> <span class="mi">30</span><span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+--------+----+</span></code></pre></figure>
<h3 id="aggregate-operator-group-by-distinct-">Aggregate operator (GROUP BY, DISTINCT) <a name="aggregate-operator"></a></h3>
<p>As discussed in the previous section <a href="sql-ref-null-semantics.html#comparision-operators">comparison operator</a>,
two <code class="highlighter-rouge">NULL</code> values are not equal. However, for the purpose of grouping and distinct processing, the two or more
values with <code class="highlighter-rouge">NULL data</code>are grouped together into the same bucket. This behaviour is conformant with SQL
standard and with other enterprise database management systems.</p>
<h4 id="examples-6">Examples</h4>
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- `NULL` values are put in one bucket in `GROUP BY` processing.</span>
<span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="k">count</span><span class="p">(</span><span class="o">*</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">GROUP</span> <span class="k">BY</span> <span class="n">age</span><span class="p">;</span>
<span class="o">+</span><span class="c1">----+--------+ </span>
<span class="o">|</span><span class="n">age</span> <span class="o">|</span><span class="k">count</span><span class="p">(</span><span class="mi">1</span><span class="p">)</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span><span class="mi">2</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="mi">2</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="mi">2</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">18</span> <span class="o">|</span><span class="mi">1</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="c1">-- All `NULL` ages are considered one distinct value in `DISTINCT` processing.</span>
<span class="k">SELECT</span> <span class="k">DISTINCT</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">person</span><span class="p">;</span>
<span class="o">+</span><span class="c1">----+</span>
<span class="o">|</span><span class="n">age</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">----+</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">30</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">18</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">----+</span></code></pre></figure>
<h3 id="sort-operator-order-by-clause-">Sort operator (ORDER BY Clause) <a name="order-by"></a></h3>
<p>Spark SQL supports null ordering specification in <code class="highlighter-rouge">ORDER BY</code> clause. Spark processes the <code class="highlighter-rouge">ORDER BY</code> clause by
placing all the <code class="highlighter-rouge">NULL</code> values at first or at last depending on the null ordering specification. By default, all
the <code class="highlighter-rouge">NULL</code> values are placed at first.</p>
<h4 id="examples-7">Examples</h4>
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- `NULL` values are shown at first and other values</span>
<span class="c1">-- are sorted in ascending way.</span>
<span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="n">name</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">ORDER</span> <span class="k">BY</span> <span class="n">age</span><span class="p">;</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span><span class="n">age</span> <span class="o">|</span><span class="n">name</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span><span class="n">Marry</span> <span class="o">|</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span><span class="n">Albert</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">18</span> <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span>
<span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="c1">-- Column values other than `NULL` are sorted in ascending</span>
<span class="c1">-- way and `NULL` values are shown at the last.</span>
<span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="n">name</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">ORDER</span> <span class="k">BY</span> <span class="n">age</span> <span class="n">NULLS</span> <span class="k">LAST</span><span class="p">;</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span><span class="n">age</span> <span class="o">|</span><span class="n">name</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span><span class="mi">18</span> <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span>
<span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span><span class="n">Marry</span> <span class="o">|</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span><span class="n">Albert</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="c1">-- Columns other than `NULL` values are sorted in descending</span>
<span class="c1">-- and `NULL` values are shown at the last.</span>
<span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="n">name</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">ORDER</span> <span class="k">BY</span> <span class="n">age</span> <span class="k">DESC</span> <span class="n">NULLS</span> <span class="k">LAST</span><span class="p">;</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span><span class="n">age</span> <span class="o">|</span><span class="n">name</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span>
<span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span>
<span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">18</span> <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span><span class="n">Marry</span> <span class="o">|</span>
<span class="o">|</span><span class="k">null</span><span class="o">|</span><span class="n">Albert</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">----+--------+</span></code></pre></figure>
<h3 id="set-operators-union-intersect-except-">Set operators (UNION, INTERSECT, EXCEPT) <a name="set-operators"></a></h3>
<p><code class="highlighter-rouge">NULL</code> values are compared in a null-safe manner for equality in the context of
set operations. That means when comparing rows, two <code class="highlighter-rouge">NULL</code> values are considered
equal unlike the regular <code class="highlighter-rouge">EqualTo</code>(<code class="highlighter-rouge">=</code>) operator.</p>
<h4 id="examples-8">Examples</h4>
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">CREATE</span> <span class="k">VIEW</span> <span class="n">unknown_age</span> <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="n">age</span> <span class="k">IS</span> <span class="k">NULL</span><span class="p">;</span>
<span class="c1">-- Only common rows between two legs of `INTERSECT` are in the </span>
<span class="c1">-- result set. The comparison between columns of the row are done</span>
<span class="c1">-- in a null-safe manner.</span>
<span class="k">SELECT</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">person</span>
<span class="k">INTERSECT</span>
<span class="k">SELECT</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span> <span class="k">from</span> <span class="n">unknown_age</span><span class="p">;</span>
<span class="o">+</span><span class="c1">------+----+ </span>
<span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">------+----+</span>
<span class="o">|</span><span class="n">Albert</span><span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="n">Marry</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">+</span><span class="c1">------+----+</span>
<span class="c1">-- `NULL` values from two legs of the `EXCEPT` are not in output. </span>
<span class="c1">-- This basically shows that the comparison happens in a null-safe manner.</span>
<span class="k">SELECT</span> <span class="n">age</span><span class="p">,</span> <span class="n">name</span> <span class="k">FROM</span> <span class="n">person</span>
<span class="k">EXCEPT</span>
<span class="k">SELECT</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">unknown_age</span><span class="p">;</span>
<span class="o">+</span><span class="c1">---+--------+ </span>
<span class="o">|</span><span class="n">age</span><span class="o">|</span><span class="n">name</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">---+--------+</span>
<span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Joe</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Fred</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">30</span> <span class="o">|</span><span class="n">Michelle</span><span class="o">|</span>
<span class="o">|</span><span class="mi">18</span> <span class="o">|</span><span class="n">Mike</span> <span class="o">|</span>
<span class="o">|</span><span class="mi">50</span> <span class="o">|</span><span class="n">Dan</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">---+--------+</span>
<span class="c1">-- Performs `UNION` operation between two sets of data. </span>
<span class="c1">-- The comparison between columns of the row ae done in</span>
<span class="c1">-- null-safe manner.</span>
<span class="k">SELECT</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">person</span>
<span class="k">UNION</span>
<span class="k">SELECT</span> <span class="n">name</span><span class="p">,</span> <span class="n">age</span> <span class="k">FROM</span> <span class="n">unknown_age</span><span class="p">;</span>
<span class="o">+</span><span class="c1">--------+----+ </span>
<span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="o">|</span><span class="n">Albert</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="n">Joe</span> <span class="o">|</span><span class="mi">30</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Michelle</span><span class="o">|</span><span class="mi">30</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Marry</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="n">Fred</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Mike</span> <span class="o">|</span><span class="mi">18</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span></code></pre></figure>
<h3 id="existsnot-exists-subquery-">EXISTS/NOT EXISTS Subquery <a name="exists-not-exists"></a></h3>
<p>In Spark, EXISTS and NOT EXISTS expressions are allowed inside a WHERE clause.
These are boolean expressions which return either <code class="highlighter-rouge">TRUE</code> or
<code class="highlighter-rouge">FALSE</code>. In other words, EXISTS is a membership condition and returns <code class="highlighter-rouge">TRUE</code>
when the subquery it refers to returns one or more rows. Similary, NOT EXISTS
is a non-membership condition and returns TRUE when no rows or zero rows are
returned from the subquery.</p>
<p>These two expressions are not affected by presence of NULL in the result of
the subquery. They are normally faster because they can be converted to
semijoins / anti-semijoins without special provisions for null awareness.</p>
<h4 id="examples-9">Examples</h4>
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- Even if subquery produces rows with `NULL` values, the `EXISTS` expression</span>
<span class="c1">-- evaluates to `TRUE` as the subquery produces 1 row.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="k">EXISTS</span> <span class="p">(</span><span class="k">SELECT</span> <span class="k">null</span><span class="p">);</span>
<span class="o">+</span><span class="c1">--------+----+ </span>
<span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="o">|</span><span class="n">Albert</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="n">Michelle</span><span class="o">|</span><span class="mi">30</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Fred</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Mike</span> <span class="o">|</span><span class="mi">18</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Marry</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="n">Joe</span> <span class="o">|</span><span class="mi">30</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="c1">-- `NOT EXISTS` expression returns `FALSE`. It returns `TRUE` only when</span>
<span class="c1">-- subquery produces no rows. In this case, it returns 1 row.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="k">NOT</span> <span class="k">EXISTS</span> <span class="p">(</span><span class="k">SELECT</span> <span class="k">null</span><span class="p">);</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">|</span><span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="c1">-- `NOT EXISTS` expression returns `TRUE`.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="k">NOT</span> <span class="k">EXISTS</span> <span class="p">(</span><span class="k">SELECT</span> <span class="mi">1</span> <span class="k">WHERE</span> <span class="mi">1</span> <span class="o">=</span> <span class="mi">0</span><span class="p">);</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="o">|</span><span class="n">name</span> <span class="o">|</span><span class="n">age</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span>
<span class="o">|</span><span class="n">Albert</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="n">Michelle</span><span class="o">|</span><span class="mi">30</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Fred</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Mike</span> <span class="o">|</span><span class="mi">18</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Marry</span> <span class="o">|</span><span class="k">null</span><span class="o">|</span>
<span class="o">|</span><span class="n">Joe</span> <span class="o">|</span><span class="mi">30</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">--------+----+</span></code></pre></figure>
<h3 id="innot-in-subquery-">IN/NOT IN Subquery <a name="in-not-in"></a></h3>
<p>In Spark, <code class="highlighter-rouge">IN</code> and <code class="highlighter-rouge">NOT IN</code> expressions are allowed inside a WHERE clause of
a query. Unlike the <code class="highlighter-rouge">EXISTS</code> expression, <code class="highlighter-rouge">IN</code> expression can return a <code class="highlighter-rouge">TRUE</code>,
<code class="highlighter-rouge">FALSE</code> or <code class="highlighter-rouge">UNKNOWN (NULL)</code> value. Conceptually a <code class="highlighter-rouge">IN</code> expression is semantically
equivalent to a set of equality condition separated by a disjunctive operator (<code class="highlighter-rouge">OR</code>).
For example, c1 IN (1, 2, 3) is semantically equivalent to <code class="highlighter-rouge">(C1 = 1 OR c1 = 2 OR c1 = 3)</code>.</p>
<p>As far as handling <code class="highlighter-rouge">NULL</code> values are concerned, the semantics can be deduced from
the <code class="highlighter-rouge">NULL</code> value handling in comparison operators(<code class="highlighter-rouge">=</code>) and logical operators(<code class="highlighter-rouge">OR</code>).
To summarize, below are the rules for computing the result of an <code class="highlighter-rouge">IN</code> expression.</p>
<ul>
<li>TRUE is returned when the non-NULL value in question is found in the list</li>
<li>FALSE is returned when the non-NULL value is not found in the list and the
list does not contain NULL values</li>
<li>UNKNOWN is returned when the value is <code class="highlighter-rouge">NULL</code>, or the non-NULL value is not found in the list
and the list contains at least one <code class="highlighter-rouge">NULL</code> value</li>
</ul>
<p>NOT IN always returns UNKNOWN when the list contains <code class="highlighter-rouge">NULL</code>, regardless of the input value.
This is because IN returns UNKNOWN if the value is not in the list containing <code class="highlighter-rouge">NULL</code>,
and because NOT UNKNOWN is again UNKNOWN.</p>
<h4 id="examples-10">Examples</h4>
<figure class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="c1">-- The subquery has only `NULL` value in its result set. Therefore,</span>
<span class="c1">-- the result of `IN` predicate is UNKNOWN.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span> <span class="k">WHERE</span> <span class="n">age</span> <span class="k">IN</span> <span class="p">(</span><span class="k">SELECT</span> <span class="k">null</span><span class="p">);</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">|</span><span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="c1">-- The subquery has `NULL` value in the result set as well as a valid </span>
<span class="c1">-- value `50`. Rows with age = 50 are returned. </span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span>
<span class="k">WHERE</span> <span class="n">age</span> <span class="k">IN</span> <span class="p">(</span><span class="k">SELECT</span> <span class="n">age</span> <span class="k">FROM</span> <span class="k">VALUES</span> <span class="p">(</span><span class="mi">50</span><span class="p">),</span> <span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="n">sub</span><span class="p">(</span><span class="n">age</span><span class="p">));</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">|</span><span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">|</span><span class="n">Fred</span><span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">|</span><span class="n">Dan</span> <span class="o">|</span><span class="mi">50</span> <span class="o">|</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="c1">-- Since subquery has `NULL` value in the result set, the `NOT IN`</span>
<span class="c1">-- predicate would return UNKNOWN. Hence, no rows are</span>
<span class="c1">-- qualified for this query.</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">person</span>
<span class="k">WHERE</span> <span class="n">age</span> <span class="k">NOT</span> <span class="k">IN</span> <span class="p">(</span><span class="k">SELECT</span> <span class="n">age</span> <span class="k">FROM</span> <span class="k">VALUES</span> <span class="p">(</span><span class="mi">50</span><span class="p">),</span> <span class="p">(</span><span class="k">null</span><span class="p">)</span> <span class="n">sub</span><span class="p">(</span><span class="n">age</span><span class="p">));</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">|</span><span class="n">name</span><span class="o">|</span><span class="n">age</span><span class="o">|</span>
<span class="o">+</span><span class="c1">----+---+</span>
<span class="o">+</span><span class="c1">----+---+</span></code></pre></figure>
</div>
<!-- /container -->
</div>
<script src="js/vendor/jquery-3.4.1.min.js"></script>
<script src="js/vendor/bootstrap.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
// We could use "//cdn.mathjax...", but that won't support "file://".
(function(d, script) {
script = d.createElement('script');
script.type = 'text/javascript';
script.async = true;
script.onload = function(){
MathJax.Hub.Config({
tex2jax: {
inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
processEscapes: true,
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
});
};
script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
'?config=TeX-AMS-MML_HTMLorMML';
d.getElementsByTagName('head')[0].appendChild(script);
}(document));
</script>
</body>
</html>